chore: Remove deprecated components/metrics and references (#3475)

Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>

chore: Remove deprecated components/metrics and references (#3475)
Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
be001a58 · Keiven C · GitHub · f712653e · be001a58 · be001a58
Unverified Commit be001a58 authored Oct 09, 2025 by Keiven C Committed by GitHub Oct 09, 2025
11 changed files
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4551,25 +4551,6 @@ dependencies = [
 "paste",
 ]
-[[package]]
-name = "metrics"
-version = "0.5.1"
-dependencies = [
- "axum 0.8.4",
- "clap 4.5.48",
- "dynamo-llm",
- "dynamo-runtime",
- "futures",
- "prometheus",
- "rand 0.9.2",
- "reqwest 0.12.23",
- "serde",
- "serde_json",
- "thiserror 2.0.16",
- "tokio",
- "tracing",
-]
 [[package]]
 name = "mime"
 version = "0.3.17"

--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,6 @@
 [workspace]
 members = [
-    "components/metrics",
    "launch/dynamo-run",
    "lib/llm",
    "lib/runtime",
@@ -18,7 +17,6 @@ members = [
 # - launch/dynamo-run
 # - lib/engines/*
 default-members = [
-    "components/metrics",
    "lib/llm",
    "lib/runtime",
    "lib/tokens",

--- a/components/metrics/Cargo.toml
+++ b/components/metrics/Cargo.toml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-[package]
-name = "metrics"
-version.workspace = true
-edition.workspace = true
-authors.workspace = true
-license.workspace = true
-homepage.workspace = true
-repository.workspace = true
-[dependencies]
-dynamo-llm = { workspace = true }
-dynamo-runtime = { workspace = true }
-futures = { workspace = true }
-prometheus = { workspace = true }
-rand = { workspace = true }
-serde = { workspace = true }
-serde_json = { workspace = true }
-thiserror = { workspace = true }
-tokio = { workspace = true }
-tracing = { workspace = true }
-axum = { version = "0.8" }
-clap = { version = "4.5", features = ["derive", "env"] }
-reqwest = { version = "0.12.22", default-features = false, features = ["json", "rustls-tls"] }
--- a/components/metrics/README.md
+++ b/components/metrics/README.md
-# Metrics
-⚠️ **DEPRECATION NOTICE** ⚠️
-**This `metrics` component is unmaintained and being deprecated.**
-The deprecated `metrics` component is being replaced by the **`MetricsRegistry`** built-in functionality that is now available directly in the `DistributedRuntime` framework. The `MetricsRegistry` provides:
-**For new projects and existing deployments, please migrate to using `MetricsRegistry` instead of this component.**
-This component may be migrated to the MetricsRegistry in the future.
-**📖 See the [Dynamo MetricsRegistry Guide](../../docs/guides/metrics.md) for detailed information on using the new metrics system.**
---
-The deprecated `metrics` component is a utility for collecting, aggregating, and publishing metrics from a Dynamo deployment, but it is unmaintained and being deprecated in favor of `MetricsRegistry`.
-**Note**: This is a demo implementation. The deprecated `metrics` component is no longer under active development.
- In this demo the metrics names use the prefix "llm", but in production they will be prefixed with "dynamo" (e.g., the HTTP `/metrics` endpoint will serve metrics with "dynamo" prefixes)
-<div align="center">
-  <img src="images/dynamo_metrics_grafana.png" alt="Dynamo Metrics Dashboard"/>
-</div>
-## Quickstart
-To start the deprecated `metrics` component, simply point it at the `namespace/component/endpoint`
-trio for the Dynamo workers that you're interested in monitoring metrics on.
-This will:
-1. Collect statistics from workers associated with that `namespace/component/endpoint`
-2. Postprocess and aggregate those statistics across the workers
-3. Publish them on a Prometheus-compatible metrics endpoint
-For example:
-```bash
-# Default namespace is "dynamo", but can be configured with --namespace
-# For more detailed output, try setting the env var: DYN_LOG=debug
-metrics --component MyComponent --endpoint my_endpoint
-# 2025-03-17T00:07:05.202558Z  INFO metrics: Scraping endpoint dynamo/MyComponent/my_endpoint for stats
-# 2025-03-17T00:07:05.202955Z  INFO metrics: Prometheus metrics server started at 0.0.0.0:9091/metrics
-# ...
-```
-With no matching endpoints running to collect stats from, you should see warnings in the logs:
-```bash
-2025-03-17T00:07:06.204756Z  WARN metrics: No endpoints found matching dynamo/MyComponent/my_endpoint
-```
-After a worker with a matching endpoint gets started, the endpoint
-will get automatically discovered and the warnings will stop.
-## Workers
-The deprecated `metrics` component needs running workers to gather metrics from,
-so below are some examples of workers and how they can be monitored.
-### Mock Worker
-To try out how the deprecated `metrics` component works, there is a demo Rust-based
-[mock worker](src/bin/mock_worker.rs) that provides sample data through two mechanisms:
-1. Exposes a stats handler at `dynamo/MyComponent/my_endpoint` that responds to polling requests (from the deprecated `metrics` component) with randomly generated `ForwardPassMetrics` data
-2. Publishes mock `KVHitRateEvent` data every second to demonstrate event-based metrics
-Step 1: Launch a mock worker via the following command (if already built):
-```bash
-# or build/run from source: DYN_LOG=DEBUG cargo run --bin mock_worker
-mock_worker
-# 2025-03-16T23:49:28.101668Z  INFO mock_worker: Starting Mock Worker on Endpoint: dynamo/MyComponent/my_endpoint
-```
-Step 2: Start the `metrics` component to monitor these mock workers. It serves its own
-Prometheus endpoint on /metrics at port 9091 (the default when --port is not specified):
-```bash
-metrics --component MyComponent --endpoint my_endpoint
-```
-### Real Worker
-To run a more realistic deployment to gather metrics:
-```bash
-python -m dynamo.frontend &
-python -m dynamo.vllm --model-path <your-model-checkout>
-```
-Then, to monitor the metrics of these VllmWorkers, run:
-```bash
-metrics --component backend --endpoint load_metrics
-```
-**NOTE**: `load_metrics` is currently a
-[hard-coded](https://github.com/ai-dynamo/dynamo/blob/d5220c7b1151372ba3d2a061c7d0a7ed72724789/lib/llm/src/kv_router/publisher.rs#L108)
-endpoint name used for python-based workers that register a `WorkerMetricsPublisher`.
-## Visualization
-To visualize the metrics being exposed on the Prometheus endpoint,
-see the Prometheus and Grafana configurations in
-[deploy/metrics](../../deploy/metrics):
-```bash
-docker compose -f deploy/docker-compose.yml --profile metrics up -d
-```
-## Metrics Collection Modes
-The deprecated `metrics` component supports two modes for exposing metrics in a Prometheus format:
-### Pull Mode (Default)
-When running in pull mode (the default), the deprecated `metrics` component will expose a
-Prometheus metrics endpoint on the specified host and port that a
-Prometheus server or curl client can pull from:
-```bash
-# Start metrics server on default host (0.0.0.0) and port (9091)
-metrics --component MyComponent --endpoint my_endpoint
-# Or specify a custom port
-metrics --component MyComponent --endpoint my_endpoint --port 9092
-```
-In pull mode:
- The `--host` parameter must be a valid IPv4 or IPv6 address (e.g., "0.0.0.0", "127.0.0.1")
- The `--port` parameter specifies which port the HTTP server will listen on
-You can then query the metrics using:
-```bash
-curl localhost:9091/metrics
-# # HELP llm_kv_blocks_active Active KV cache blocks
-# # TYPE llm_kv_blocks_active gauge
-# llm_kv_blocks_active{component="MyComponent",endpoint="my_endpoint",worker_id="7587884888253033398"} 40
-# llm_kv_blocks_active{component="MyComponent",endpoint="my_endpoint",worker_id="7587884888253033401"} 2
-# # HELP llm_kv_blocks_total Total KV cache blocks
-# # TYPE llm_kv_blocks_total gauge
-# llm_kv_blocks_total{component="MyComponent",endpoint="my_endpoint",worker_id="7587884888253033398"} 100
-# llm_kv_blocks_total{component="MyComponent",endpoint="my_endpoint",worker_id="7587884888253033401"} 100
-```
-### Push Mode
-For ephemeral or batch jobs, or when metrics need to be pushed through a firewall,
-you can use Push mode. In this mode, the deprecated `metrics` component will periodically push
-metrics to an externally hosted
-[Prometheus PushGateway](https://prometheus.io/docs/instrumenting/pushing/):
-Start a prometheus push gateway service via docker:
-```bash
-docker run --rm -d -p 9091:9091 --name pushgateway prom/pushgateway
-```
-Start the deprecated `metrics` component in `--push` mode, specifying the host and port of your PushGateway:
-```bash
-# Push metrics to a Prometheus PushGateway every --push-interval seconds
-metrics \
-    --component MyComponent \
-    --endpoint my_endpoint \
-    --host 127.0.0.1 \
-    --port 9091 \
-    --push
-```
-When using Push mode:
- The `--host` parameter must be a valid IPv4 or IPv6 address (e.g., "0.0.0.0", "127.0.0.1")
-  that the Prometheus PushGateway is running on
- The `--port` parameter specifies the port of the Prometheus PushGateway
- The push interval can be configured with `--push-interval` (default: 2 seconds)
- A default job name of "dynamo_metrics" is used for the Prometheus job label
- Metrics persist in the PushGateway until explicitly deleted
- Prometheus should be configured to scrape the PushGateway with `honor_labels: true`
-To view the metrics hosted on the PushGateway:
-```bash
-# View all metrics
-# curl http://<pushgateway_ip>:<pushgateway_port>/metrics
-curl 127.0.0.1:9091/metrics
-```
-## Building/Running from Source
-For easy iteration while making edits to the deprecated `metrics` component, you can use `cargo run`
-to build and run with your local changes:
-```bash
-cargo run --bin metrics -- --component MyComponent --endpoint my_endpoint
-```
--- a/components/metrics/images/dynamo_metrics_grafana.png
+++ b/components/metrics/images/dynamo_metrics_grafana.png
--- a/components/metrics/src/bin/mock_worker.rs
+++ b/components/metrics/src/bin/mock_worker.rs
-// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-use dynamo_llm::kv_router::{
-    KV_HIT_RATE_SUBJECT,
-    protocols::{ForwardPassMetrics, KvStats, WorkerStats},
-    scheduler::KVHitRateEvent,
-};
-use dynamo_runtime::{
-    DistributedRuntime, Result, Runtime, Worker,
-    component::{Namespace, service::EndpointStats},
-    logging,
-    pipeline::{
-        AsyncEngine, AsyncEngineContextProvider, Error, ManyOut, ResponseStream, SingleIn,
-        async_trait, network::Ingress,
-    },
-    protocols::annotated::Annotated,
-    stream,
-    traits::events::EventPublisher,
-};
-use rand::Rng;
-use std::sync::Arc;
-use tokio::time::{Duration, interval};
-fn main() -> Result<()> {
-    logging::init();
-    let worker = Worker::from_settings()?;
-    worker.execute(app)
-}
-async fn app(runtime: Runtime) -> Result<()> {
-    let distributed = DistributedRuntime::from_settings(runtime.clone()).await?;
-    backend(distributed).await
-}
-struct MockRequestHandler {}
-impl MockRequestHandler {
-    fn new() -> Arc<Self> {
-        Arc::new(Self {})
-    }
-}
-#[async_trait]
-impl AsyncEngine<SingleIn<String>, ManyOut<Annotated<String>>, Error> for MockRequestHandler {
-    async fn generate(&self, input: SingleIn<String>) -> Result<ManyOut<Annotated<String>>> {
-        let (data, ctx) = input.into_parts();
-        let chars = data
-            .chars()
-            .map(|c| Annotated::from_data(c.to_string()))
-            .collect::<Vec<_>>();
-        let stream = stream::iter(chars);
-        Ok(ResponseStream::new(Box::pin(stream), ctx.context()))
-    }
-}
-// FIXME: These events are just for testing and may not currently be used.
-/// Publishes a mock KV hit rate event on `namespace` once per second, forever.
-///
-/// This function never returns. It does not spawn anything itself; the caller
-/// is expected to drive it on its own task (e.g. via `tokio::spawn`).
-///
-/// A publish failure is logged as a warning and the loop continues with the
-/// next tick.
-async fn mock_event_publisher(namespace: Namespace) {
-    // NOTE: These events are just for testing, and shouldn't be interpreted
-    // in correlation with the stats handler's data:
-    // 1. The worker ID associated with the events here won't match the
-    // worker ID of the endpoint's service stats handler.
-    // 2. These events aren't coming through the KV Router, so the metrics won't
-    // be reflective of the KV Router's performance.
-    // 3. The data in these events aren't in sync with the stats handler's
-    // ForwardPassMetrics data, so they may not correlate well.
-    // The worker ID is drawn once and reused for every event from this task.
-    let worker_id = rand::rng().random_range(1..=1000);
-    let mut interval = interval(Duration::from_secs(1));
-    loop {
-        interval.tick().await;
-        // Draw a fresh random event on every tick; overlap_blocks never exceeds isl_blocks.
-        let isl_blocks = rand::rng().random_range(0..=100);
-        let overlap_blocks = rand::rng().random_range(0..=isl_blocks);
-        let event = KVHitRateEvent {
-            worker_id,
-            isl_blocks,
-            overlap_blocks: overlap_blocks as u32,
-        };
-        if let Err(e) = namespace.publish(KV_HIT_RATE_SUBJECT, &event).await {
-            tracing::warn!("Failed to publish KV hit rate event: {e}");
-        } else {
-            // isl_blocks can be 0, in which case 0.0 / 0.0 would be logged as "NaN%".
-            // Report a 0% hit rate for an empty request instead.
-            let hit_rate_percent = if isl_blocks == 0 {
-                0.0
-            } else {
-                (overlap_blocks as f64 / isl_blocks as f64) * 100.0
-            };
-            tracing::debug!(
-                "Published KV hit rate event: worker_id={worker_id}, isl_blocks={isl_blocks}, overlap_blocks={overlap_blocks}, hit_rate={:.2}%",
-                hit_rate_percent
-            );
-        }
-    }
-}
-/// Generates mock forward pass metrics for stats handler
-fn mock_stats_handler(_stats: EndpointStats) -> serde_json::Value {
-    let request_total_slots = 100;
-    let request_active_slots = rand::rng().random_range(0..=request_total_slots);
-    let kv_total_blocks = 100;
-    let kv_active_blocks = rand::rng().random_range(0..=kv_total_blocks);
-    let num_requests_waiting = rand::rng().random_range(0..=100);
-    let gpu_cache_usage_perc = rand::rng().random_range(0.0..=1.0);
-    let gpu_prefix_cache_hit_rate = rand::rng().random_range(0.0..=1.0);
-    let worker_stats = WorkerStats {
-        data_parallel_rank: None, // Default for backwards compatibility
-        request_active_slots,
-        request_total_slots,
-        num_requests_waiting,
-    };
-    let kv_stats = KvStats {
-        kv_active_blocks,
-        kv_total_blocks,
-        gpu_cache_usage_perc,
-        gpu_prefix_cache_hit_rate,
-    };
-    let spec_decode_stats = None;
-    let stats = ForwardPassMetrics {
-        worker_stats,
-        kv_stats,
-        spec_decode_stats,
-    };
-    tracing::info!("Stats: {stats:?}");
-    serde_json::to_value(stats).unwrap()
-}
-async fn backend(runtime: DistributedRuntime) -> Result<()> {
-    let namespace = runtime.namespace("dynamo")?;
-    // we must first create a service, then we can attach one or more endpoints
-    let component = namespace
-        .component("MyComponent")?
-        .service_builder()
-        .create()
-        .await?;
-    let endpoint = component.endpoint("my_endpoint");
-    tracing::info!("Starting Mock Worker on Endpoint: {}", endpoint.path());
-    // Spawn background task for publishing KV hit rate events
-    let namespace_clone = namespace.clone();
-    tokio::spawn(async move {
-        mock_event_publisher(namespace_clone).await;
-    });
-    // Attach an ingress to the engine
-    let ingress = Ingress::for_engine(MockRequestHandler::new())?;
-    // Make the ingress discoverable via a component service
-    endpoint
-        .endpoint_builder()
-        // Dummy stats handler to demonstrate how to attach a custom stats handler
-        .stats_handler(mock_stats_handler)
-        .handler(ingress)
-        .start()
-        .await
-}
--- a/components/metrics/src/lib.rs
+++ b/components/metrics/src/lib.rs
-// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-//! Library functions for the metrics application.
-//!
-//! This library provides functionality to expose Prometheus metrics either through a local HTTP server
-//! or by pushing to a Prometheus PushGateway.
-//!
-//! # Examples
-//!
-//! ## Using the metrics pull mode
-//! ```no_run
-//! use metrics::{PrometheusMetricsCollector, MetricsMode};
-//!
-//! #[tokio::main]
-//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
-//!     let mut collector = PrometheusMetricsCollector::new()?;
-//!
-//!     // Start a metrics server with default values
-//!     collector.start(MetricsMode::default())?;
-//!
-//!     // Or explicitly specify values
-//!     collector.start(MetricsMode::Pull {
-//!         host: "127.0.0.1".to_string(),
-//!         port: 9090,
-//!     })?;
-//!
-//!     // Or use the convenience constructor
-//!     collector.start(MetricsMode::new_pull())?;
-//!
-//!     // Your application code here
-//!     tokio::signal::ctrl_c().await?;
-//!
-//!     // Stop the metrics server gracefully
-//!     collector.stop();
-//!     Ok(())
-//! }
-//! ```
-//!
-//! ## Using the Push mode
-//! ```no_run
-//! use metrics::{PrometheusMetricsCollector, MetricsMode};
-//!
-//! #[tokio::main]
-//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
-//!     let mut collector = PrometheusMetricsCollector::new()?;
-//!
-//!     // Start pushing metrics to a Prometheus PushGateway with default values
-//!     collector.start(MetricsMode::new_push())?;
-//!
-//!     // Or explicitly specify values
-//!     collector.start(MetricsMode::Push {
-//!         host: "127.0.0.1".to_string(),
-//!         port: 9091,
-//!         job: "custom_job".to_string(),
-//!         interval: 5, // Push every 5 seconds
-//!     })?;
-//!
-//!     // Your application code here
-//!     tokio::signal::ctrl_c().await?;
-//!
-//!     // Stop pushing metrics gracefully
-//!     collector.stop();
-//!     Ok(())
-//! }
-//! ```
-use axum::{Router, routing::get};
-use prometheus::{Encoder, TextEncoder, register_counter_vec, register_gauge_vec};
-use reqwest::Client;
-use serde::{Deserialize, Serialize};
-use std::net::SocketAddr;
-use std::time::Duration as StdDuration;
-use dynamo_llm::kv_router::protocols::{ForwardPassMetrics, LoadMetrics};
-use dynamo_llm::kv_router::scoring::Endpoint;
-use dynamo_llm::kv_router::scoring::ProcessedEndpoints;
-use dynamo_runtime::{
-    Result, distributed::Component, error, service::EndpointInfo, utils::Duration,
-};
-/// Configuration for metrics collection mode
-#[derive(Debug, Clone)]
-pub enum MetricsMode {
-    /// Host a Prometheus metrics server for pull-based collection
-    Pull {
-        /// Host to listen on (e.g. "0.0.0.0")
-        host: String,
-        /// Port to listen on (e.g. 9091)
-        port: u16,
-    },
-    /// Push to a Prometheus PushGateway
-    Push {
-        /// PushGateway host (e.g. "127.0.0.1"); the "http://" scheme is prepended automatically
-        host: String,
-        /// PushGateway port (e.g. 9091)
-        port: u16,
-        /// Job name for the metrics
-        job: String,
-        /// Push interval in seconds
-        interval: u64,
-    },
-}
-impl Default for MetricsMode {
-    fn default() -> Self {
-        Self::new_pull()
-    }
-}
-impl MetricsMode {
-    /// Create a new Pull mode with default values
-    pub fn new_pull() -> Self {
-        Self::Pull {
-            host: "0.0.0.0".to_string(),
-            port: 9091,
-        }
-    }
-    /// Create a new Push mode with default values
-    pub fn new_push() -> Self {
-        Self::Push {
-            host: "127.0.0.1".to_string(),
-            port: 9091,
-            job: "dynamo_metrics".to_string(),
-            interval: 2,
-        }
-    }
-}
-/// Configuration for LLM worker load capacity metrics
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct LLMWorkerLoadCapacityConfig {
-    pub component_name: String,
-    pub endpoint_name: String,
-    pub model_name: Option<String>,
-}
-/// Metrics collector for exposing metrics to prometheus/grafana
-pub struct PrometheusMetricsCollector {
-    metrics: PrometheusMetrics,
-    mode: Option<MetricsMode>,
-    shutdown_tx: Option<tokio::sync::oneshot::Sender<()>>,
-}
-impl PrometheusMetricsCollector {
-    pub fn new() -> Result<Self> {
-        Ok(Self {
-            metrics: PrometheusMetrics::new()?,
-            mode: None,
-            shutdown_tx: None,
-        })
-    }
-    /// Start metrics collection with the specified mode
-    pub fn start(&mut self, mode: MetricsMode) -> Result<()> {
-        // Store the mode
-        self.mode = Some(mode.clone());
-        match mode {
-            MetricsMode::Pull { host, port } => self.start_pull_mode(host, port),
-            MetricsMode::Push {
-                host,
-                port,
-                job,
-                interval,
-            } => self.start_push_mode(host, port, job, interval),
-        }
-    }
-    /// Stop metrics collection
-    pub fn stop(&mut self) {
-        if let Some(tx) = self.shutdown_tx.take() {
-            let _ = tx.send(());
-        }
-    }
-    /// Start a metrics server for pull-based collection on the specified port
-    fn start_pull_mode(&mut self, host: String, port: u16) -> Result<()> {
-        // Create an axum router with a metrics endpoint
-        let app = Router::new().route(
-            "/metrics",
-            get(|| async {
-                // Gather and encode metrics
-                let encoder = TextEncoder::new();
-                let mut buffer = Vec::new();
-                encoder.encode(&prometheus::gather(), &mut buffer).unwrap();
-                String::from_utf8(buffer).unwrap()
-            }),
-        );
-        // Create a socket address to listen on
-        let ip_addr = host.parse().map_err(|e| {
-            error!("Failed to parse host '{}' as IP address: {}. Use a valid IPv4 or IPv6 address (e.g. '0.0.0.0' or '127.0.0.1')", host, e)
-        })?;
-        let addr = SocketAddr::new(ip_addr, port);
-        // Create shutdown channel
-        let (tx, rx) = tokio::sync::oneshot::channel();
-        self.shutdown_tx = Some(tx);
-        // Spawn the server in a background task
-        tokio::spawn(async move {
-            let listener = tokio::net::TcpListener::bind(addr)
-                .await
-                .unwrap_or_else(|_| panic!("could not bind to address: {addr}"));
-            let server = axum::serve(listener, app);
-            // Create a future that completes when shutdown signal is received
-            let shutdown_future = async {
-                rx.await.ok();
-            };
-            // Run the server with graceful shutdown
-            tokio::select! {
-                result = server => {
-                    if let Err(e) = result {
-                        tracing::error!("Metrics server error: {}", e);
-                    }
-                },
-                _ = shutdown_future => {
-                    tracing::info!("Metrics server shutting down gracefully");
-                },
-            }
-        });
-        tracing::info!("Prometheus metrics server started at {addr}/metrics");
-        Ok(())
-    }
-    /// Start pushing metrics to a Prometheus PushGateway
-    fn start_push_mode(
-        &mut self,
-        host: String,
-        port: u16,
-        job: String,
-        interval: u64,
-    ) -> Result<()> {
-        // Create shutdown channel
-        let (tx, mut rx) = tokio::sync::oneshot::channel();
-        self.shutdown_tx = Some(tx);
-        // Create HTTP client
-        let client = Client::new();
-        let url = format!("http://{host}:{port}/metrics/job/{job}");
-        let url_clone = url.clone();
-        let interval_duration = StdDuration::from_secs(interval);
-        // Spawn background task to periodically push metrics
-        tokio::spawn(async move {
-            let mut interval = tokio::time::interval(interval_duration);
-            loop {
-                tokio::select! {
-                    _ = interval.tick() => {
-                        // Gather and encode metrics
-                        let encoder = TextEncoder::new();
-                        let mut buffer = Vec::new();
-                        if let Err(e) = encoder.encode(&prometheus::gather(), &mut buffer) {
-                            tracing::error!("Failed to encode metrics: {}", e);
-                            continue;
-                        }
-                        // Push metrics to the gateway
-                        match client.post(&url)
-                            .header("Content-Type", encoder.format_type())
-                            .body(buffer)
-                            .send()
-                            .await
-                        {
-                            Ok(response) => {
-                                if response.status().is_success() {
-                                    tracing::debug!("Successfully pushed metrics to PushGateway");
-                                } else {
-                                    tracing::error!(
-                                        "Failed to push metrics to PushGateway. Status: {}, Error: {:?}",
-                                        response.status(),
-                                        response.text().await
-                                    );
-                                }
-                            }
-                            Err(e) => {
-                                tracing::error!("Failed to push metrics to PushGateway: {}", e);
-                            }
-                        }
-                    }
-                    _ = &mut rx => {
-                        tracing::info!("Stopping metrics push task");
-                        break;
-                    }
-                }
-            }
-        });
-        tracing::info!(
-            "Started pushing metrics to PushGateway at '{url_clone}' with job name '{job}'"
-        );
-        Ok(())
-    }
-    /// Update metrics with current values
-    pub fn update(&mut self, config: &LLMWorkerLoadCapacityConfig, processed: &ProcessedEndpoints) {
-        self.metrics.update(config, processed);
-    }
-    /// Update KV hit rate metrics
-    pub fn update_kv_hit_rate(
-        &mut self,
-        config: &LLMWorkerLoadCapacityConfig,
-        worker_id: i64,
-        isl_blocks: usize,
-        overlap_blocks: usize,
-    ) {
-        self.metrics
-            .update_kv_hit_rate(config, worker_id, isl_blocks, overlap_blocks);
-    }
-}
-/// Prometheus metrics collection
-pub struct PrometheusMetrics {
-    kv_blocks_active: prometheus::GaugeVec,
-    kv_blocks_total: prometheus::GaugeVec,
-    requests_active: prometheus::GaugeVec,
-    requests_total: prometheus::GaugeVec,
-    load_avg: prometheus::GaugeVec,
-    load_std: prometheus::GaugeVec,
-    // KV hit rate metrics
-    kv_hit_rate_percent: prometheus::GaugeVec,
-    // FIXME: These are currently unused outside of mock_worker
-    kv_hit_rate_isl_blocks: prometheus::CounterVec,
-    kv_hit_rate_overlap_blocks: prometheus::CounterVec,
-}
-impl PrometheusMetrics {
-    /// Initialize all metrics
-    fn new() -> Result<Self> {
-        Ok(Self {
-            kv_blocks_active: register_gauge_vec!(
-                "llm_kv_blocks_active",
-                "Active KV cache blocks",
-                &["component", "endpoint", "worker_id"]
-            )?,
-            kv_blocks_total: register_gauge_vec!(
-                "llm_kv_blocks_total",
-                "Total KV cache blocks",
-                &["component", "endpoint", "worker_id"]
-            )?,
-            requests_active: register_gauge_vec!(
-                "llm_requests_active_slots",
-                "Active request slots",
-                &["component", "endpoint", "worker_id"]
-            )?,
-            requests_total: register_gauge_vec!(
-                "llm_requests_total_slots",
-                "Total request slots",
-                &["component", "endpoint", "worker_id"]
-            )?,
-            load_avg: register_gauge_vec!(
-                "llm_load_avg",
-                "Average load across workers",
-                &["component", "endpoint"]
-            )?,
-            load_std: register_gauge_vec!(
-                "llm_load_std",
-                "Load standard deviation across workers",
-                &["component", "endpoint"]
-            )?,
-            // KV hit rate (ForwardPassMetrics)
-            kv_hit_rate_percent: register_gauge_vec!(
-                "llm_kv_hit_rate_percent",
-                "KV hit rate percentage per worker",
-                &["component", "endpoint", "worker_id"]
-            )?,
-            // FIXME: Cleanup/remove event based metrics after finalizing
-            //        metrics collection approach with vllm/trtllm workers.
-            // Event-based KV hit rate metrics (not currently used outside mock worker)
-            kv_hit_rate_isl_blocks: register_counter_vec!(
-                "llm_kv_hit_rate_isl_blocks",
-                "Cumulative count of ISL blocks in KV hit rate events",
-                &["component", "endpoint", "worker_id"]
-            )?,
-            kv_hit_rate_overlap_blocks: register_counter_vec!(
-                "llm_kv_hit_rate_overlap_blocks",
-                "Cumulative count of overlapping blocks in KV hit rate events",
-                &["component", "endpoint", "worker_id"]
-            )?,
-        })
-    }
-    /// Helper method to set a gauge with worker-specific labels (3 labels)
-    fn set_worker_gauge(
-        &self,
-        gauge: &prometheus::GaugeVec,
-        config: &LLMWorkerLoadCapacityConfig,
-        worker_id: &String,
-        value: f64,
-    ) {
-        gauge
-            .with_label_values(&[&config.component_name, &config.endpoint_name, worker_id])
-            .set(value);
-    }
-    /// Helper method to increment a counter with worker-specific labels (3 labels)
-    fn increment_worker_counter(
-        &self,
-        counter: &prometheus::CounterVec,
-        config: &LLMWorkerLoadCapacityConfig,
-        worker_id: &String,
-        value: f64,
-    ) {
-        counter
-            .with_label_values(&[&config.component_name, &config.endpoint_name, worker_id])
-            .inc_by(value);
-    }
-    /// Helper method to set a gauge with component/endpoint labels only (2 labels)
-    fn set_endpoint_gauge(
-        &self,
-        gauge: &prometheus::GaugeVec,
-        config: &LLMWorkerLoadCapacityConfig,
-        value: f64,
-    ) {
-        gauge
-            .with_label_values(&[&config.component_name, &config.endpoint_name])
-            .set(value);
-    }
-    /// Refresh every per-worker gauge and the endpoint-level aggregate gauges from one
-    /// batch of processed endpoint data.
-    ///
-    /// For each entry in `processed.endpoints` the KV block, request slot and KV hit rate
-    /// gauges are set under that worker's id. Entries whose payload is not
-    /// `LoadMetrics::EngineLoadMetrics` are logged and skipped instead of panicking, so a
-    /// single unexpected payload cannot take down the aggregator. The `load_avg` and
-    /// `load_std` gauges are always updated.
-    fn update(&self, config: &LLMWorkerLoadCapacityConfig, processed: &ProcessedEndpoints) {
-        // Update per-worker metrics
-        for (worker_id, endpoint) in processed.endpoints.iter() {
-            let worker_id = worker_id.to_string();
-            // Borrow the payload rather than cloning it: only a few numeric fields are read.
-            let LoadMetrics::EngineLoadMetrics(metrics) = &endpoint.data else {
-                tracing::error!(
-                    "Skipping worker {worker_id}: can only update with ForwardPassMetrics"
-                );
-                continue;
-            };
-            // NOTE(review): `kv_hit_rate_percent` receives `gpu_prefix_cache_hit_rate`
-            // unscaled — confirm whether the source value is a fraction or a percentage.
-            let per_worker_gauges = [
-                (
-                    &self.kv_blocks_active,
-                    metrics.kv_stats.kv_active_blocks as f64,
-                ),
-                (
-                    &self.kv_blocks_total,
-                    metrics.kv_stats.kv_total_blocks as f64,
-                ),
-                (
-                    &self.requests_active,
-                    metrics.worker_stats.request_active_slots as f64,
-                ),
-                (
-                    &self.requests_total,
-                    metrics.worker_stats.request_total_slots as f64,
-                ),
-                (
-                    &self.kv_hit_rate_percent,
-                    metrics.kv_stats.gpu_prefix_cache_hit_rate as f64,
-                ),
-            ];
-            for (gauge, value) in per_worker_gauges {
-                self.set_worker_gauge(gauge, config, &worker_id, value);
-            }
-        }
-        // Update aggregate metrics
-        self.set_endpoint_gauge(&self.load_avg, config, processed.load_avg);
-        self.set_endpoint_gauge(&self.load_std, config, processed.load_std);
-    }
-    /// Record one KV hit rate observation for a worker.
-    ///
-    /// Adds `isl_blocks` and `overlap_blocks` to the worker's cumulative counters, then
-    /// logs at debug level the running overlap/ISL ratio as a percentage. Nothing is
-    /// logged while the cumulative ISL count is not greater than zero.
-    pub fn update_kv_hit_rate(
-        &self,
-        config: &LLMWorkerLoadCapacityConfig,
-        worker_id: i64,
-        isl_blocks: usize,
-        overlap_blocks: usize,
-    ) {
-        let worker_id_str = worker_id.to_string();
-        // Bump the ISL counter first, then the overlap counter.
-        for (counter, blocks) in [
-            (&self.kv_hit_rate_isl_blocks, isl_blocks),
-            (&self.kv_hit_rate_overlap_blocks, overlap_blocks),
-        ] {
-            self.increment_worker_counter(counter, config, &worker_id_str, blocks as f64);
-        }
-        // TODO: The cumulative hit rate percentage can probably be computed by consumers
-        // of Prometheus metrics like Grafana instead, but we'll compute it here for now
-        // for convenient debugging/logging.
-        // Reads back the running total of this worker's series.
-        let read_total = |counter: &prometheus::CounterVec| {
-            counter
-                .with_label_values(&[
-                    &config.component_name,
-                    &config.endpoint_name,
-                    &worker_id_str,
-                ])
-                .get()
-        };
-        let cumulative_isl = read_total(&self.kv_hit_rate_isl_blocks);
-        let cumulative_overlap = read_total(&self.kv_hit_rate_overlap_blocks);
-        if cumulative_isl > 0.0 {
-            let cumulative_hit_rate = (cumulative_overlap / cumulative_isl) * 100.0;
-            tracing::debug!(
-                "Estimated Cumulative KV hit rate: {cumulative_hit_rate:.2}% (Overlap: {cumulative_overlap} / ISL: {cumulative_isl})"
-            );
-        }
-    }
-}
-/// Scrape stats from `component` and keep only the endpoints whose subject starts
-/// with `subject`.
-///
-/// `timeout` is forwarded to `Component::scrape_stats`; an error from the scrape is
-/// returned to the caller unchanged. The surviving endpoints are logged at debug level.
-pub async fn collect_endpoints(
-    component: &Component,
-    subject: &str,
-    timeout: Duration,
-) -> Result<Vec<EndpointInfo>> {
-    // Collect stats from each backend
-    let scraped = component.scrape_stats(timeout).await?;
-    // Keep only the entries that belong to the requested service subject
-    let mut endpoints = Vec::new();
-    for candidate in scraped.into_endpoints() {
-        if candidate.subject.starts_with(subject) {
-            endpoints.push(candidate);
-        }
-    }
-    tracing::debug!("Endpoints: {endpoints:?}");
-    Ok(endpoints)
-}
-/// Decode the `ForwardPassMetrics` payload carried by each endpoint.
-///
-/// Endpoints with no data are skipped silently; endpoints whose data fails to decode
-/// are skipped after logging an error. The returned vector can therefore be shorter
-/// than `endpoints`, and its indices do not necessarily line up with the input.
-pub fn extract_metrics(endpoints: &[EndpointInfo]) -> Vec<ForwardPassMetrics> {
-    // Read each payload in place; only the value handed to `decode` is cloned.
-    let metrics: Vec<ForwardPassMetrics> = endpoints
-        .iter()
-        .filter_map(|endpoint| {
-            let metrics_data = endpoint.data.as_ref()?;
-            match metrics_data.clone().decode::<ForwardPassMetrics>() {
-                Ok(stats) => Some(stats),
-                Err(err) => {
-                    tracing::error!(
-                        "Failed to decode ForwardPassMetrics data: {}. Raw data: {:?}",
-                        err,
-                        metrics_data
-                    );
-                    None
-                }
-            }
-        })
-        .collect();
-    tracing::debug!("Metrics: {metrics:?}");
-    metrics
-}
-/// Build `ProcessedEndpoints` by pairing each metrics entry with an endpoint.
-///
-/// Pairing is positional: `metrics[i]` is attached to `endpoints[i]`, and the longer
-/// slice is truncated. Pairs whose endpoint id cannot be resolved are dropped. Each
-/// resulting entry is named `worker-<id>` and wraps the metrics in
-/// `LoadMetrics::EngineLoadMetrics`.
-///
-/// A length mismatch means some endpoint produced no metrics upstream, so positional
-/// pairing may attach metrics to the wrong worker; this is reported with a warning.
-pub fn postprocess_metrics(
-    metrics: &[ForwardPassMetrics],
-    endpoints: &[EndpointInfo],
-) -> ProcessedEndpoints {
-    if metrics.len() != endpoints.len() {
-        tracing::warn!(
-            "Metrics/endpoints length mismatch ({} vs {}); pairing is positional, so metrics may be attributed to the wrong worker",
-            metrics.len(),
-            endpoints.len()
-        );
-    }
-    let processed_endpoints: Vec<Endpoint> = metrics
-        .iter()
-        .zip(endpoints.iter())
-        .filter_map(|(m, e)| {
-            e.id().ok().map(|id| Endpoint {
-                name: format!("worker-{id}"),
-                subject: e.subject.clone(),
-                data: LoadMetrics::EngineLoadMetrics(m.clone()),
-            })
-        })
-        .collect();
-    ProcessedEndpoints::new(processed_endpoints)
-}
--- a/components/metrics/src/main.rs
+++ b/components/metrics/src/main.rs
-// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-//! Metrics is a metrics aggregator designed to operate within a namespace and collect
-//! metrics from all workers.
-//!
-//! Metrics will collect for now:
-//!
-//! - LLM Worker Load:Capacity
-//!   - These metrics will be scraped by the LLM NATS Service API's stats request
-//!   - Request Slots: [Active, Total]
-//!   - KV Cache Blocks: [Active, Total]
-//! - KV Hit Rate:
-//!   - These metrics will be collected from KV hit rate events published by the KV router
-//!   - ISL Blocks: Cumulative count of total blocks in all KV hit rate events
-//!   - Overlap Blocks: Cumulative count of blocks that were already in the KV cache
-use clap::Parser;
-use dynamo_llm::kv_router::KV_HIT_RATE_SUBJECT;
-use dynamo_llm::kv_router::scheduler::KVHitRateEvent;
-use dynamo_runtime::{
-    DistributedRuntime, ErrorContext, Result, Runtime, Worker, error, logging,
-    traits::events::{EventPublisher, EventSubscriber},
-    utils::{Duration, Instant},
-};
-use futures::stream::StreamExt;
-use std::sync::Arc;
-// Import from our library
-use metrics::{
-    LLMWorkerLoadCapacityConfig, MetricsMode, PrometheusMetricsCollector, collect_endpoints,
-    extract_metrics, postprocess_metrics,
-};
-/// CLI arguments for the metrics application
-// NOTE: clap's `Parser` derive surfaces the `///` comments on this struct and its
-// fields as `--help` text, so editing them changes user-visible output.
-// The "minimum 1 second" limits and the non-empty checks on `component` / `endpoint`
-// are enforced by `get_config`, not by clap.
-#[derive(Parser, Debug)]
-#[command(author, version, about, long_about = None)]
-struct Args {
-    /// Namespace to operate in and subscribe to events on
-    #[arg(long, env = "DYN_NAMESPACE", default_value = "dynamo")]
-    namespace: String,
-    /// Component to scrape metrics from
-    #[arg(long)]
-    component: String,
-    /// Endpoint to scrape metrics from
-    #[arg(long)]
-    endpoint: String,
-    /// Model name for the target component (optional)
-    #[arg(long)]
-    model_name: Option<String>,
-    /// Polling interval in seconds for scraping dynamo endpoint stats (minimum 1 second)
-    #[arg(long, default_value = "1")]
-    poll_interval: u64,
-    /// Host for serving or pushing prometheus metrics (default: 0.0.0.0)
-    // Used as the bind host in pull mode and as the push target host in push mode.
-    #[arg(
-        long,
-        default_value = "0.0.0.0",
-        help_heading = "Prometheus Metrics Config"
-    )]
-    host: String,
-    /// Port to run the Prometheus metrics server on (default: 9091)
-    // Also passed as the target port when `--push` is set.
-    #[arg(
-        long,
-        default_value = "9091",
-        help_heading = "Prometheus Metrics Config"
-    )]
-    port: u16,
-    /// Push metrics to an external Prometheus Pushgateway instead of hosting them in-process
-    #[arg(long, help_heading = "Prometheus Metrics Config")]
-    push: bool,
-    /// Push interval in seconds, when using push mode (minimum 1 second, default: 2)
-    // Only validated when `push` is true.
-    #[arg(long, default_value = "2", help_heading = "Prometheus Metrics Config")]
-    push_interval: u64,
-}
-/// Validate the parsed CLI arguments and build the scrape target configuration.
-///
-/// Checks run in this order and the first failure is returned: empty component name,
-/// empty endpoint name, zero poll interval, and (only in push mode) zero push interval.
-fn get_config(args: &Args) -> Result<LLMWorkerLoadCapacityConfig> {
-    let Args {
-        component,
-        endpoint,
-        model_name,
-        poll_interval,
-        push,
-        push_interval,
-        ..
-    } = args;
-    if component.is_empty() {
-        return Err(error!("Component name cannot be empty"));
-    }
-    if endpoint.is_empty() {
-        return Err(error!("Endpoint name cannot be empty"));
-    }
-    if *poll_interval == 0 {
-        return Err(error!("Polling interval must be at least 1 second"));
-    }
-    // The push interval only matters when push mode is enabled.
-    let push_interval_invalid = *push && *push_interval == 0;
-    if push_interval_invalid {
-        return Err(error!("Push interval must be at least 1 second"));
-    }
-    let config = LLMWorkerLoadCapacityConfig {
-        component_name: component.clone(),
-        endpoint_name: endpoint.clone(),
-        model_name: model_name.clone(),
-    };
-    Ok(config)
-}
-/// Run the metrics aggregator until the primary lease's child token is cancelled.
-///
-/// Steps:
-/// 1. Parse the CLI arguments and validate them with `get_config`.
-/// 2. Create the `<etcd_root>/instance` key for the `count` component; a failure here is
-///    reported as "possibly one already exists", so only one aggregator is expected.
-/// 3. Start the Prometheus collector in push or pull mode, depending on `--push`.
-/// 4. Spawn a background task that feeds KV hit rate events into the collector.
-/// 5. Every `poll_interval` seconds: scrape the target endpoint, update the Prometheus
-///    gauges, and publish the aggregate on the `l2c.<component>.<endpoint>` event.
-///
-/// An iteration that finds no endpoints still reaches the cancellation-aware wait at the
-/// bottom of the loop, so shutdown is observed and the poll interval is respected even
-/// while no workers are registered.
-///
-/// Returns an error if setup fails, or if a scrape or publish fails during polling.
-async fn app(runtime: Runtime) -> Result<()> {
-    let args = Args::parse();
-    let config = get_config(&args)?;
-    tracing::debug!("Config: {config:?}");
-    let drt = DistributedRuntime::from_settings(runtime.clone()).await?;
-    let namespace = drt.namespace(args.namespace)?;
-    let component = namespace.component("count")?;
-    // Create unique instance of Count
-    let key = format!("{}/instance", component.etcd_root());
-    tracing::debug!("Creating unique instance of Count at {key}");
-    drt.etcd_client()
-        .expect("Unreachable because of DistributedRuntime::from_settings above")
-        .kv_create(&key, serde_json::to_vec_pretty(&config)?, None)
-        .await
-        .context("Unable to create unique instance of Count; possibly one already exists")?;
-    let target_component = namespace.component(&config.component_name)?;
-    let target_endpoint = target_component.endpoint(&config.endpoint_name);
-    let service_path = target_endpoint.path();
-    let service_subject = target_endpoint.subject();
-    tracing::info!("Scraping endpoint {service_path} for stats");
-    // Safety: DistributedRuntime::from_settings ensures this is Some
-    let token = drt.primary_lease().unwrap().child_token();
-    let event_name = format!("l2c.{}.{}", config.component_name, config.endpoint_name);
-    // Initialize Prometheus metrics with the selected mode
-    let metrics_collector = PrometheusMetricsCollector::new()?;
-    let metrics_collector = Arc::new(tokio::sync::Mutex::new(metrics_collector));
-    // Start metrics collection in the selected mode
-    let metrics_mode = if args.push {
-        MetricsMode::Push {
-            host: args.host,
-            port: args.port,
-            job: "dynamo_push_metrics".to_string(),
-            interval: args.push_interval,
-        }
-    } else {
-        MetricsMode::Pull {
-            host: args.host,
-            port: args.port,
-        }
-    };
-    metrics_collector.lock().await.start(metrics_mode)?;
-    // TODO: Consider removing event subscription until metrics are more standardized
-    // Subscribe to KV hit rate events
-    let kv_hit_rate_subject = KV_HIT_RATE_SUBJECT;
-    tracing::debug!("Subscribing to KV hit rate events on subject: {kv_hit_rate_subject}");
-    // Clone fields for the event subscription task
-    let config_clone = config.clone();
-    let namespace_clone = namespace.clone();
-    let metrics_collector_clone = metrics_collector.clone();
-    // Note: Subscribing to KVHitRateEvent for illustration purposes. They're not used in production.
-    // Spawn a task to handle KV hit rate events
-    tokio::spawn(async move {
-        match namespace_clone.subscribe(kv_hit_rate_subject).await {
-            Ok(mut subscriber) => {
-                tracing::debug!("Successfully subscribed to KV hit rate events");
-                while let Some(msg) = subscriber.next().await {
-                    match serde_json::from_slice::<KVHitRateEvent>(&msg.payload) {
-                        Ok(event) => {
-                            // An event with zero ISL blocks would otherwise log NaN.
-                            let cache_hit_pct = if event.isl_blocks == 0 {
-                                0.0
-                            } else {
-                                (event.overlap_blocks as f64 / event.isl_blocks as f64) * 100.0
-                            };
-                            tracing::debug!(
-                                "Received KV hit rate event: worker_id={}, isl_blocks={}, overlap_blocks={}, cache_hit_pct={:.2}%",
-                                event.worker_id,
-                                event.isl_blocks,
-                                event.overlap_blocks,
-                                cache_hit_pct
-                            );
-                            // Update metrics with the event data. The guard is not `mut`
-                            // because `update_kv_hit_rate` takes `&self`.
-                            let metrics = metrics_collector_clone.lock().await;
-                            metrics.update_kv_hit_rate(
-                                &config_clone,
-                                event.worker_id,
-                                event.isl_blocks,
-                                event.overlap_blocks as usize,
-                            );
-                        }
-                        Err(e) => {
-                            tracing::warn!("Failed to deserialize KV hit rate event: {e}");
-                        }
-                    }
-                }
-                tracing::warn!("KV hit rate event subscription stream ended");
-            }
-            Err(e) => {
-                tracing::error!("Failed to subscribe to KV hit rate events: {:?}", e);
-            }
-        }
-    });
-    loop {
-        let next = Instant::now() + Duration::from_secs(args.poll_interval);
-        // Collect and process metrics
-        let scrape_timeout = Duration::from_secs(1);
-        let endpoints =
-            collect_endpoints(&target_component, &service_subject, scrape_timeout).await?;
-        if endpoints.is_empty() {
-            // Deliberately no `continue`: fall through to the wait below so that
-            // cancellation is checked and the poll interval is honoured.
-            tracing::warn!("No endpoints found matching {service_path}");
-        } else {
-            let metrics = extract_metrics(&endpoints);
-            let processed = postprocess_metrics(&metrics, &endpoints);
-            if processed.endpoints.is_empty() {
-                tracing::warn!("No metrics found matching {service_path}");
-            } else {
-                tracing::info!("Aggregated metrics: {processed:?}");
-            }
-            // Update Prometheus metrics
-            metrics_collector.lock().await.update(&config, &processed);
-            // TODO: Enable KV Routers to subscribe to metrics events published here
-            // for a single view of the aggregated metrics, as opposed to the current
-            // approach where each KV Router computes and published its own metrics.
-            // Publish metrics event
-            namespace.publish(&event_name, &processed).await?;
-        }
-        // Wait until cancelled or the next tick
-        match tokio::time::timeout_at(next, token.cancelled()).await {
-            Ok(_) => break,
-            Err(_) => continue,
-        }
-    }
-    Ok(())
-}
-/// Process entry point: initialise logging, build a `Worker` from settings and hand
-/// `app` to it for execution. Any error from worker construction or from `app` is returned.
-fn main() -> Result<()> {
-    logging::init();
-    Worker::from_settings()?.execute(app)
-}
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use std::env;
-    /// With no `--namespace` flag, the namespace is taken from `DYN_NAMESPACE`.
-    #[test]
-    fn test_namespace_from_env() {
-        // The variable is set for the whole test process and is not restored afterwards.
-        unsafe { env::set_var("DYN_NAMESPACE", "test-namespace") };
-        let argv = ["count", "--component", "comp", "--endpoint", "end"];
-        let parsed = Args::parse_from(argv);
-        assert_eq!(parsed.namespace, "test-namespace");
-    }
-}
--- a/deploy/metrics/README.md
+++ b/deploy/metrics/README.md
@@ -174,7 +174,7 @@ The following configuration files should be present in this directory:
 - [grafana_dashboards/grafana-dashboard-providers.yml](./grafana_dashboards/grafana-dashboard-providers.yml): Contains Grafana dashboard provider configuration
 - [grafana_dashboards/grafana-dynamo-dashboard.json](./grafana_dashboards/grafana-dynamo-dashboard.json): A general Dynamo Dashboard for both SW and HW metrics.
 - [grafana_dashboards/grafana-dcgm-metrics.json](./grafana_dashboards/grafana-dcgm-metrics.json): Contains Grafana dashboard configuration for DCGM GPU metrics
- [grafana_dashboards/grafana-llm-metrics.json](./grafana_dashboards/grafana-llm-metrics.json): This file, which is being phased out, contains the Grafana dashboard configuration for LLM-specific metrics. It requires an additional `metrics` component to operate concurrently. A new version is under development.
+- [grafana_dashboards/grafana-kvbm-dashboard.json](./grafana_dashboards/grafana-kvbm-dashboard.json): Contains Grafana dashboard configuration for KVBM metrics
 ### Metric Name Constants
@@ -237,8 +237,6 @@ This centralized approach ensures all Dynamo components use consistent, valid Pr
   - DCGM Exporter: `http://localhost:9401/metrics`
-   - Start the [components/metrics](../../components/metrics/README.md) application to begin monitoring for metric events from dynamo workers and aggregating them on a Prometheus metrics endpoint: `http://localhost:9091/metrics`.
-   - Uncomment the appropriate lines in prometheus.yml to poll port 9091.
   - Start worker(s) that publish KV Cache metrics: [lib/runtime/examples/service_metrics/README.md](../../lib/runtime/examples/service_metrics/README.md) can populate dummy KV Cache metrics.
 ### Configuration
@@ -275,7 +273,7 @@ Grafana is pre-configured with:
  docker compose logs grafana
  ```
-3. For issues with the legacy metrics component (being phased out), see [components/metrics/README.md](../../components/metrics/README.md) for details on the exposed metrics and troubleshooting steps.
+3. Check Prometheus targets at `http://localhost:9090/targets` to verify metric collection.
 ## Developer Guide
@@ -477,21 +475,6 @@ let requests_total = namespace.create_counter(
 )?;
 ```
-## Running the deprecated `components/metrics` program
-⚠️ **DEPRECATION NOTICE** ⚠️
-When you run the example [components/metrics](../../components/metrics/README.md) program, it exposes a Prometheus /metrics endpoint with the following metrics (defined in [components/metrics/src/lib.rs](../../components/metrics/src/lib.rs)):
-**⚠️ The following `llm_*` metrics are deprecated:**
- `llm_requests_active_slots`: Active request slots per worker
- `llm_requests_total_slots`: Total available request slots per worker
- `llm_kv_blocks_active`: Active KV blocks per worker
- `llm_kv_blocks_total`: Total KV blocks available per worker
- `llm_kv_hit_rate_percent`: KV Cache hit percent per worker
- `llm_load_avg`: Average load across workers
- `llm_load_std`: Load standard deviation across workers
 ## Troubleshooting
@@ -506,4 +489,4 @@ When you run the example [components/metrics](../../components/metrics/README.md
  docker compose logs grafana
  ```
-3. For issues with the legacy metrics component (being phased out), see [components/metrics/README.md](../../components/metrics/README.md) for details on the exposed metrics and troubleshooting steps.
+3. Check Prometheus targets at `http://localhost:9090/targets` to verify metric collection.
--- a/deploy/metrics/grafana_dashboards/grafana-llm-metrics.json
+++ b/deploy/metrics/grafana_dashboards/grafana-llm-metrics.json
-{
-  "annotations": {
-    "list": [
-      {
-        "builtIn": 1,
-        "datasource": {
-          "type": "grafana",
-          "uid": "-- Grafana --"
-        },
-        "enable": true,
-        "hide": true,
-        "iconColor": "rgba(0, 211, 255, 1)",
-        "name": "Annotations & Alerts",
-        "type": "dashboard"
-      }
-    ]
-  },
-  "copyright": [
-    "SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.",
-    "SPDX-License-Identifier: Apache-2.0",
-    "Licensed under the Apache License, Version 2.0 (the \"License\");",
-    "you may not use this file except in compliance with the License.",
-    "You may obtain a copy of the License at",
-    "http://www.apache.org/licenses/LICENSE-2.0",
-    "Unless required by applicable law or agreed to in writing, software",
-    "distributed under the License is distributed on an \"AS IS\" BASIS,",
-    "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.",
-    "See the License for the specific language governing permissions and",
-    "limitations under the License.",
-    "",
-    "DEPRECATION NOTICE:",
-    "This dashboard uses deprecated llm_kv_* metrics (llm_kv_blocks_active, llm_kv_blocks_total, llm_kv_hit_rate_percent)",
-    "that are part of the deprecated metrics aggregation service. These metrics will be removed in a future release.",
-    "Please migrate to the new MetricsRegistry system which provides dynamo_* metrics instead.",
-    "See docs/guides/metrics.md for migration guidance."
-  ],
-  "editable": true,
-  "fiscalYearStartMonth": 0,
-  "graphTooltip": 0,
-  "id": 1,
-  "links": [],
-  "liveNow": false,
-  "panels": [
-    {
-      "datasource": {
-        "type": "prometheus",
-        "uid": "prometheus"
-      },
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "drawStyle": "line",
-            "fillOpacity": 20,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "lineInterpolation": "smooth",
-            "lineWidth": 2,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "percent"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 8,
-        "w": 12,
-        "x": 0,
-        "y": 0
-      },
-      "id": 1,
-      "options": {
-        "legend": {
-          "calcs": [
-            "mean",
-            "max"
-          ],
-          "displayMode": "table",
-          "placement": "right",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "title": "KV Cache Utilization by Worker",
-      "type": "timeseries",
-      "targets": [
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "prometheus"
-          },
-          "editorMode": "code",
-          "expr": "100 * llm_kv_blocks_active{component=\"$component\", endpoint=\"$endpoint\"} / llm_kv_blocks_total{component=\"$component\", endpoint=\"$endpoint\"}",
-          "legendFormat": "Worker {{worker_id}}",
-          "range": true,
-          "refId": "A"
-        }
-      ]
-    },
-    {
-      "datasource": {
-        "type": "prometheus",
-        "uid": "prometheus"
-      },
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "drawStyle": "line",
-            "fillOpacity": 20,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "lineInterpolation": "smooth",
-            "lineWidth": 2,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "percent"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 8,
-        "w": 12,
-        "x": 12,
-        "y": 0
-      },
-      "id": 2,
-      "options": {
-        "legend": {
-          "calcs": [
-            "mean",
-            "max"
-          ],
-          "displayMode": "table",
-          "placement": "right",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "title": "Request Slot Utilization by Worker",
-      "type": "timeseries",
-      "targets": [
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "prometheus"
-          },
-          "editorMode": "code",
-          "expr": "100 * llm_requests_active_slots{component=\"$component\", endpoint=\"$endpoint\"} / llm_requests_total_slots{component=\"$component\", endpoint=\"$endpoint\"}",
-          "legendFormat": "Worker {{worker_id}}",
-          "range": true,
-          "refId": "A"
-        }
-      ]
-    },
-    {
-      "datasource": {
-        "type": "prometheus",
-        "uid": "prometheus"
-      },
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "yellow",
-                "value": 50
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "percent"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 8,
-        "w": 4,
-        "x": 0,
-        "y": 8
-      },
-      "id": 3,
-      "options": {
-        "orientation": "auto",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "showThresholdLabels": false,
-        "showThresholdMarkers": true
-      },
-      "pluginVersion": "10.0.0",
-      "title": "Average KV Cache Utilization",
-      "type": "gauge",
-      "targets": [
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "prometheus"
-          },
-          "editorMode": "code",
-          "expr": "100 * avg(llm_kv_blocks_active{component=\"$component\", endpoint=\"$endpoint\"}) / avg(llm_kv_blocks_total{component=\"$component\", endpoint=\"$endpoint\"})",
-          "legendFormat": "__auto",
-          "range": true,
-          "refId": "A"
-        }
-      ]
-    },
-    {
-      "datasource": {
-        "type": "prometheus",
-        "uid": "prometheus"
-      },
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              },
-              {
-                "color": "yellow",
-                "value": 50
-              },
-              {
-                "color": "red",
-                "value": 80
-              }
-            ]
-          },
-          "unit": "percent"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 8,
-        "w": 4,
-        "x": 4,
-        "y": 8
-      },
-      "id": 4,
-      "options": {
-        "orientation": "auto",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "showThresholdLabels": false,
-        "showThresholdMarkers": true
-      },
-      "pluginVersion": "10.0.0",
-      "title": "Average Request Slot Utilization",
-      "type": "gauge",
-      "targets": [
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "prometheus"
-          },
-          "editorMode": "code",
-          "expr": "100 * avg(llm_requests_active_slots{component=\"$component\", endpoint=\"$endpoint\"}) / avg(llm_requests_total_slots{component=\"$component\", endpoint=\"$endpoint\"})",
-          "legendFormat": "__auto",
-          "range": true,
-          "refId": "A"
-        }
-      ]
-    },
-    {
-      "datasource": {
-        "type": "prometheus",
-        "uid": "prometheus"
-      },
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "thresholds"
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              }
-            ]
-          },
-          "unit": "percent"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 8,
-        "w": 4,
-        "x": 8,
-        "y": 8
-      },
-      "id": 7,
-      "options": {
-        "orientation": "auto",
-        "reduceOptions": {
-          "calcs": [
-            "lastNotNull"
-          ],
-          "fields": "",
-          "values": false
-        },
-        "showThresholdLabels": false,
-        "showThresholdMarkers": true
-      },
-      "pluginVersion": "10.0.0",
-      "title": "Average KV Cache Hit Rate",
-      "type": "gauge",
-      "targets": [
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "prometheus"
-          },
-          "editorMode": "code",
-          "expr": "100 * avg(llm_kv_hit_rate_percent{component=\"$component\", endpoint=\"$endpoint\"})",
-          "legendFormat": "__auto",
-          "range": true,
-          "refId": "A"
-        }
-      ]
-    },
-    {
-      "datasource": {
-        "type": "prometheus",
-        "uid": "prometheus"
-      },
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "drawStyle": "line",
-            "fillOpacity": 20,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "lineInterpolation": "smooth",
-            "lineWidth": 2,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              }
-            ]
-          },
-          "unit": "none"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 8,
-        "w": 12,
-        "x": 12,
-        "y": 8
-      },
-      "id": 5,
-      "options": {
-        "legend": {
-          "calcs": [
-            "mean",
-            "max"
-          ],
-          "displayMode": "table",
-          "placement": "right",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "title": "Load Average & Standard Deviation",
-      "type": "timeseries",
-      "targets": [
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "prometheus"
-          },
-          "editorMode": "code",
-          "expr": "llm_load_avg{component=\"$component\", endpoint=\"$endpoint\"}",
-          "legendFormat": "Average",
-          "range": true,
-          "refId": "A"
-        },
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "prometheus"
-          },
-          "editorMode": "code",
-          "expr": "llm_load_std{component=\"$component\", endpoint=\"$endpoint\"}",
-          "hide": false,
-          "legendFormat": "StdDev",
-          "range": true,
-          "refId": "B"
-        }
-      ]
-    },
-    {
-      "datasource": {
-        "type": "prometheus",
-        "uid": "prometheus"
-      },
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "drawStyle": "line",
-            "fillOpacity": 20,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "lineInterpolation": "smooth",
-            "lineWidth": 2,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              }
-            ]
-          },
-          "unit": "percent"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 8,
-        "w": 12,
-        "x": 0,
-        "y": 16
-      },
-      "id": 8,
-      "options": {
-        "legend": {
-          "calcs": [
-            "mean",
-            "max"
-          ],
-          "displayMode": "table",
-          "placement": "right",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "title": "KV Cache Hit Rate by Worker",
-      "type": "timeseries",
-      "targets": [
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "prometheus"
-          },
-          "editorMode": "code",
-          "expr": "100 * llm_kv_hit_rate_percent{component=\"$component\", endpoint=\"$endpoint\"}",
-          "legendFormat": "Worker {{worker_id}}",
-          "range": true,
-          "refId": "A"
-        }
-      ]
-    },
-    {
-      "datasource": {
-        "type": "prometheus",
-        "uid": "prometheus"
-      },
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "drawStyle": "line",
-            "fillOpacity": 20,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "lineInterpolation": "smooth",
-            "lineWidth": 2,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              }
-            ]
-          },
-          "unit": "percent"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 8,
-        "w": 12,
-        "x": 12,
-        "y": 16
-      },
-      "id": 9,
-      "options": {
-        "legend": {
-          "calcs": [
-            "mean",
-            "max"
-          ],
-          "displayMode": "table",
-          "placement": "right",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "title": "Average KV Cache Hit Rate",
-      "type": "timeseries",
-      "targets": [
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "prometheus"
-          },
-          "editorMode": "code",
-          "expr": "avg(100 * llm_kv_hit_rate_percent{component=\"$component\", endpoint=\"$endpoint\"})",
-          "legendFormat": "Average Hit Rate",
-          "range": true,
-          "refId": "A"
-        }
-      ]
-    },
-    {
-      "datasource": {
-        "type": "prometheus",
-        "uid": "prometheus"
-      },
-      "fieldConfig": {
-        "defaults": {
-          "color": {
-            "mode": "palette-classic"
-          },
-          "custom": {
-            "axisCenteredZero": false,
-            "axisColorMode": "text",
-            "axisLabel": "",
-            "axisPlacement": "auto",
-            "barAlignment": 0,
-            "drawStyle": "line",
-            "fillOpacity": 20,
-            "gradientMode": "none",
-            "hideFrom": {
-              "legend": false,
-              "tooltip": false,
-              "viz": false
-            },
-            "lineInterpolation": "smooth",
-            "lineWidth": 2,
-            "pointSize": 5,
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "showPoints": "never",
-            "spanNulls": false,
-            "stacking": {
-              "group": "A",
-              "mode": "none"
-            },
-            "thresholdsStyle": {
-              "mode": "off"
-            }
-          },
-          "mappings": [],
-          "thresholds": {
-            "mode": "absolute",
-            "steps": [
-              {
-                "color": "green",
-                "value": null
-              }
-            ]
-          },
-          "unit": "none"
-        },
-        "overrides": []
-      },
-      "gridPos": {
-        "h": 8,
-        "w": 24,
-        "x": 0,
-        "y": 24
-      },
-      "id": 6,
-      "options": {
-        "legend": {
-          "calcs": [
-            "mean",
-            "max"
-          ],
-          "displayMode": "table",
-          "placement": "right",
-          "showLegend": true
-        },
-        "tooltip": {
-          "mode": "multi",
-          "sort": "none"
-        }
-      },
-      "title": "Available Resources",
-      "type": "timeseries",
-      "targets": [
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "prometheus"
-          },
-          "editorMode": "code",
-          "expr": "sum(llm_kv_blocks_total{component=\"$component\", endpoint=\"$endpoint\"} - llm_kv_blocks_active{component=\"$component\", endpoint=\"$endpoint\"})",
-          "legendFormat": "Available KV Blocks",
-          "range": true,
-          "refId": "A"
-        },
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "prometheus"
-          },
-          "editorMode": "code",
-          "expr": "sum(llm_requests_total_slots{component=\"$component\", endpoint=\"$endpoint\"} - llm_requests_active_slots{component=\"$component\", endpoint=\"$endpoint\"})",
-          "hide": false,
-          "legendFormat": "Available Request Slots",
-          "range": true,
-          "refId": "B"
-        }
-      ]
-    }
-  ],
-  "refresh": "2s",
-  "schemaVersion": 38,
-  "style": "dark",
-  "tags": [
-    "llm",
-    "metrics"
-  ],
-  "templating": {
-    "list": [
-      {
-        "current": {
-          "selected": false,
-          "text": "component",
-          "value": "vllm"
-        },
-        "datasource": {
-          "type": "prometheus",
-          "uid": "prometheus"
-        },
-        "definition": "label_values(llm_kv_blocks_active, component)",
-        "hide": 0,
-        "includeAll": false,
-        "label": "Component",
-        "multi": false,
-        "name": "component",
-        "options": [],
-        "query": {
-          "query": "label_values(llm_kv_blocks_active, component)",
-          "refId": "StandardVariableQuery"
-        },
-        "refresh": 1,
-        "regex": "",
-        "skipUrlSync": false,
-        "sort": 0,
-        "type": "query"
-      },
-      {
-        "current": {
-          "selected": false,
-          "text": "endpoint",
-          "value": "load_metrics"
-        },
-        "datasource": {
-          "type": "prometheus",
-          "uid": "prometheus"
-        },
-        "definition": "label_values(llm_kv_blocks_active{component=\"$component\"}, endpoint)",
-        "hide": 0,
-        "includeAll": false,
-        "label": "Endpoint",
-        "multi": false,
-        "name": "endpoint",
-        "options": [],
-        "query": {
-          "query": "label_values(llm_kv_blocks_active{component=\"$component\"}, endpoint)",
-          "refId": "StandardVariableQuery"
-        },
-        "refresh": 1,
-        "regex": "",
-        "skipUrlSync": false,
-        "sort": 0,
-        "type": "query"
-      }
-    ]
-  },
-  "time": {
-    "from": "now-5m",
-    "to": "now"
-  },
-  "timepicker": {},
-  "timezone": "",
-  "title": "LLM Worker Metrics",
-  "uid": "llm-worker-metrics",
-  "version": 1,
-  "weekStart": ""
-}
\ No newline at end of file
--- a/deploy/metrics/prometheus.yml
+++ b/deploy/metrics/prometheus.yml
@@ -33,13 +33,13 @@ scrape_configs:
    static_configs:
      - targets: ['dcgm-exporter:9401']  # on the "monitoring" network
-  # This is a demo service that needs to be launched manually. See components/metrics/README.md
+  # This is a demo service that needs to be launched manually
-  # Note that you may need to disable the firewall on your host. On Ubuntu: sudo ufw allow 8080/tcp
+  # Note that you may need to open this port in your host's firewall. On Ubuntu: sudo ufw allow 8000/tcp
-  # You can also force the port, if the default is different: python -m dynamo.frontend --http-port 8080
+  # You can also force the port, if the default is different: python -m dynamo.frontend --http-port 8000
  - job_name: 'dynamo-frontend'
    scrape_interval: 10s
    static_configs:
-      - targets: ['host.docker.internal:8080']  # on the "monitoring" network
+      - targets: ['host.docker.internal:8000']  # on the "monitoring" network
  # Launch via: DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 dynamo.<backend> ...
  # If you want to update the scrape_interval, you may want to also update component.rs's MAX_DELAY
@@ -48,15 +48,6 @@ scrape_configs:
    static_configs:
      - targets: ['host.docker.internal:8081']
-  # DEPRECATED: This metrics aggregation service is being deprecated in favor of MetricsRegistry
-  # The new system uses the 'dynamo-backend' job above instead of this separate service
-  # This is another demo aggregator that needs to be launched manually. See components/metrics/README.md
-  # Note that you may need to disable the firewall on your host. On Ubuntu: sudo ufw allow 9091/tcp
-  - job_name: 'metrics-aggregation-service'
-    scrape_interval: 2s
-    static_configs:
-      # - targets: ['localhost:9091']  # metrics aggregation service on host
-      - targets: ['host.docker.internal:9091']  # metrics aggregation service on host
  # KVBM leader related metrics
  - job_name: 'kvbm-metrics'