feat: Add per-worker Prometheus metrics for router load monitoring (#5842)

Signed-off-by: hongkuanz <hongkuanz@nvidia.com>

feat: Add per-worker Prometheus metrics for router load monitoring (#5842)
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
0c0336e6 · Hongkuan Zhou · GitHub · eff08aed · 0c0336e6 · 0c0336e6
Unverified Commit 0c0336e6 authored Feb 04, 2026 by Hongkuan Zhou Committed by GitHub Feb 04, 2026
20 changed files
--- a/examples/hierarchical_planner/run_example.sh
+++ b/examples/hierarchical_planner/run_example.sh
@@ -27,8 +27,8 @@ python -m dynamo.global_router \
 # ============================================================================
 DYN_NAMESPACE=prefill_pool_0 python -m dynamo.router \
  --endpoint prefill_pool_0.worker.generate \
-  --block-size 16 & \
+  --block-size 16 \
-  --no-track-active-blocks # prefill router does not need to track active blocks
+  --no-track-active-blocks &  # prefill router does not need to track active blocks
 python -m dynamo.mocker \
  --model-path Qwen/Qwen3-0.6B \
@@ -41,8 +41,8 @@ python -m dynamo.mocker \
 # ============================================================================
 DYN_NAMESPACE=prefill_pool_1 python -m dynamo.router \
  --endpoint prefill_pool_1.worker.generate \
-  --block-size 16 & \
+  --block-size 16 \
-  --no-track-active-blocks # prefill router does not need to track active blocks
+  --no-track-active-blocks &  # prefill router does not need to track active blocks
 python -m dynamo.mocker \
  --model-path Qwen/Qwen3-0.6B \
@@ -55,8 +55,8 @@ python -m dynamo.mocker \
 # ============================================================================
 DYN_NAMESPACE=decode_pool_0 python -m dynamo.router \
  --endpoint decode_pool_0.worker.generate \
-  --block-size 16 & \
+  --block-size 16 \
-  --kv-overlap-score-weight 0
+  --kv-overlap-score-weight 0 &
 python -m dynamo.mocker \
  --model-path Qwen/Qwen3-0.6B \

--- a/lib/bindings/c/src/lib.rs
+++ b/lib/bindings/c/src/lib.rs
@@ -1335,6 +1335,7 @@ pub async fn create_worker_selection_pipeline_chat(
    >,
    Option<Arc<dynamo_llm::kv_router::KvRouter>>,
 )> {
+    use dynamo_llm::discovery::WORKER_TYPE_DECODE;
    use dynamo_llm::kv_router::PrefillRouter;
    // Use the global DRT singleton - initialize if not already done
@@ -1401,7 +1402,12 @@ pub async fn create_worker_selection_pipeline_chat(
    let chooser = if router_mode == RouterMode::KV {
        Some(
            model_manager
-                .kv_chooser_for(&endpoint, card.kv_cache_block_size, kv_router_config)
+                .kv_chooser_for(
+                    &endpoint,
+                    card.kv_cache_block_size,
+                    kv_router_config,
+                    WORKER_TYPE_DECODE,
+                )
                .await?,
        )
    } else {
@@ -1425,6 +1431,7 @@ pub async fn create_worker_selection_pipeline_chat(
                card.kv_cache_block_size,
                Some(prefill_config),
                enforce_disagg,
+                model_name.to_string(),
            )
        });

--- a/lib/bindings/python/rust/llm/kv.rs
+++ b/lib/bindings/python/rust/llm/kv.rs
@@ -991,7 +991,11 @@ impl KvRecorder {
 }
 /// Helper function to create a KV router from an endpoint using the ModelManager
-/// to ensure proper etcd registration
+/// to ensure proper etcd registration.
+/// Infers worker type using endpoint naming and router config:
+/// - If endpoint namespace, component, or name contains "prefill" (case-insensitive), treat as prefill
+/// - If router_track_active_blocks is disabled, treat as prefill
+/// - Otherwise, default to decode
 async fn create_kv_router_from_endpoint(
    endpoint: &Endpoint,
    block_size: usize,
@@ -999,8 +1003,28 @@ async fn create_kv_router_from_endpoint(
 ) -> Result<Arc<llm_rs::kv_router::KvRouter>, PyErr> {
    // Create ModelManager and use it to create KvRouter (ensures registration)
    let model_manager = Arc::new(llm_rs::discovery::ModelManager::new());
+    let endpoint_id = endpoint.inner.id();
+    let namespace = endpoint_id.namespace.to_lowercase();
+    let component = endpoint_id.component.to_lowercase();
+    let name = endpoint_id.name.to_lowercase();
+    let endpoint_is_prefill =
+        namespace.contains("prefill") || component.contains("prefill") || name.contains("prefill");
+    let track_active_blocks = kv_router_config
+        .as_ref()
+        .map(|cfg| cfg.router_track_active_blocks)
+        .unwrap_or(true);
+    let worker_type = if endpoint_is_prefill || !track_active_blocks {
+        llm_rs::discovery::WORKER_TYPE_PREFILL
+    } else {
+        llm_rs::discovery::WORKER_TYPE_DECODE
+    };
    let kv_router = model_manager
-        .kv_chooser_for(&endpoint.inner, block_size as u32, kv_router_config)
+        .kv_chooser_for(
+            &endpoint.inner,
+            block_size as u32,
+            kv_router_config,
+            worker_type,
+        )
        .await
        .map_err(to_pyerr)?;
@@ -1096,7 +1120,17 @@ impl KvPushRouter {
 #[pymethods]
 impl KvPushRouter {
+    /// Create a new KvPushRouter for KV-aware routing to workers.
+    ///
+    /// # Arguments
+    /// * `endpoint` - The endpoint to route requests to
+    /// * `block_size` - KV cache block size for routing decisions
+    /// * `kv_router_config` - Configuration for the KV router
+    ///
+    /// Note: Worker type for Prometheus metrics is inferred from the endpoint namespace/component/name
+    /// (contains "prefill", case-insensitive) or by `router_track_active_blocks` being disabled.
    #[new]
+    #[pyo3(signature = (endpoint, block_size, kv_router_config))]
    fn new(
        endpoint: &Endpoint,
        block_size: usize,

--- a/lib/bindings/python/src/dynamo/prometheus_names.py
+++ b/lib/bindings/python/src/dynamo/prometheus_names.py
@@ -80,6 +80,22 @@ class frontend_service:
    MODEL_MIGRATION_LIMIT = "model_migration_limit"
    # Total number of request migrations due to worker unavailability
    MODEL_MIGRATION_TOTAL = "model_migration_total"
+    # Active decode blocks (KV cache blocks) per worker
+    # Gauge metric tracking current KV cache block utilization for each worker
+    WORKER_ACTIVE_DECODE_BLOCKS = "worker_active_decode_blocks"
+    # Active prefill tokens per worker
+    # Gauge metric tracking current queued prefill tokens for each worker
+    WORKER_ACTIVE_PREFILL_TOKENS = "worker_active_prefill_tokens"
+    # Last observed time to first token per worker (in seconds)
+    # Gauge metric tracking the most recent TTFT for each worker
+    WORKER_LAST_TIME_TO_FIRST_TOKEN_SECONDS = "worker_last_time_to_first_token_seconds"
+    # Last observed input sequence tokens per worker
+    # Gauge metric tracking the input token count from the same request as WORKER_LAST_TIME_TO_FIRST_TOKEN_SECONDS
+    # Set together with TTFT from the same request (two separate gauge writes) to correlate latency with input size
+    WORKER_LAST_INPUT_SEQUENCE_TOKENS = "worker_last_input_sequence_tokens"
+    # Last observed inter-token latency per worker (in seconds)
+    # Gauge metric tracking the most recent ITL for each worker
+    WORKER_LAST_INTER_TOKEN_LATENCY_SECONDS = "worker_last_inter_token_latency_seconds"
    # Label name for the type of migration
    MIGRATION_TYPE_LABEL = "migration_type"

--- a/lib/llm/src/discovery.rs
+++ b/lib/llm/src/discovery.rs
@@ -11,4 +11,8 @@ mod watcher;
 pub use watcher::{ModelUpdate, ModelWatcher};
 mod worker_monitor;
-pub use worker_monitor::{KvWorkerMonitor, LoadThresholdConfig, WorkerLoadState};
+pub use worker_monitor::{
+    KvWorkerMonitor, LoadThresholdConfig, WORKER_ACTIVE_DECODE_BLOCKS_GAUGE,
+    WORKER_ACTIVE_PREFILL_TOKENS_GAUGE, WORKER_TYPE_DECODE, WORKER_TYPE_PREFILL, WorkerLoadState,
+    register_worker_load_metrics,
+};
--- a/lib/llm/src/discovery/model_manager.rs
+++ b/lib/llm/src/discovery/model_manager.rs
@@ -354,6 +354,7 @@ impl ModelManager {
        endpoint: &Endpoint,
        kv_cache_block_size: u32,
        kv_router_config: Option<KvRouterConfig>,
+        worker_type: &'static str,
    ) -> anyhow::Result<Arc<KvRouter>> {
        let endpoint_id = endpoint.id();
@@ -403,6 +404,7 @@ impl ModelManager {
            Some(selector),
            kv_router_config,
            instance_id,
+            worker_type,
        )
        .await?;
        let new_kv_chooser = Arc::new(chooser);
@@ -538,6 +540,11 @@ impl ModelManager {
        Some(monitor.load_threshold_config())
    }
+    /// Gets an existing worker monitor for a model, if one exists.
+    pub fn get_worker_monitor(&self, model: &str) -> Option<KvWorkerMonitor> {
+        self.worker_monitors.get(model).map(|m| m.clone())
+    }
    /// Gets or creates a worker monitor for a model. Updates thresholds if monitor exists.
    pub fn get_or_create_worker_monitor(
        &self,

--- a/lib/llm/src/discovery/watcher.rs
+++ b/lib/llm/src/discovery/watcher.rs
@@ -24,6 +24,7 @@ use dynamo_runtime::{
 use crate::{
    backend::Backend,
+    discovery::WORKER_TYPE_DECODE,
    entrypoint::{self, EngineFactoryCallback, RouterConfig},
    http::service::metrics::Metrics,
    kv_router::PrefillRouter,
@@ -429,6 +430,7 @@ impl ModelWatcher {
                            &endpoint,
                            card.kv_cache_block_size,
                            Some(self.router_config.kv_router_config),
+                            WORKER_TYPE_DECODE, // This is the decode router
                        )
                        .await?,
                )
@@ -441,9 +443,10 @@ impl ModelWatcher {
            // Create prefill chooser once if we're building pipelines
            // Both chat and completions will share the same prefill chooser instance
+            let model_name = card.name().to_string();
            let prefill_chooser = self
                .manager
-                .register_prefill_router(card.name().to_string())
+                .register_prefill_router(model_name.clone())
                .map(|rx| {
                    // Create prefill-specific config with track_active_blocks disabled
                    let mut prefill_config = self.router_config.kv_router_config;
@@ -456,21 +459,19 @@ impl ModelWatcher {
                        card.kv_cache_block_size,
                        Some(prefill_config),
                        self.router_config.enforce_disagg,
+                        model_name.clone(), // Pass model name for worker monitor lookup
                    )
                });
-            // Get or create the worker monitor for this model
+            // Get or create the worker monitor for this model.
-            // This allows dynamic threshold updates via the ModelManager
+            // Always create the monitor for Prometheus metrics (active_decode_blocks, active_prefill_tokens,
-            // Create monitor if any threshold is configured
+            // worker TTFT/ITL cleanup). The thresholds control busy detection behavior only.
-            let worker_monitor = if self.router_config.load_threshold_config.is_configured() {
+            // LoadThresholdConfig allows dynamic threshold updates via the ModelManager.
-                Some(self.manager.get_or_create_worker_monitor(
+            let worker_monitor = Some(self.manager.get_or_create_worker_monitor(
                card.name(),
                client.clone(),
                self.router_config.load_threshold_config.clone(),
-                ))
+            ));
-            } else {
-                None
-            };
            // Add chat engine only if the model supports chat
            if card.model_type.supports_chat() {

--- a/lib/llm/src/discovery/worker_monitor.rs
+++ b/lib/llm/src/discovery/worker_monitor.rs
--- a/lib/llm/src/entrypoint/input/common.rs
+++ b/lib/llm/src/entrypoint/input/common.rs
@@ -271,6 +271,9 @@ where
    let service_backend = match router_mode {
        RouterMode::Random | RouterMode::RoundRobin | RouterMode::Direct(_) => {
+            // Non-KV routing: use PushRouter directly.
+            // Note: Per-worker metrics (active_prefill_tokens, active_decode_blocks) are only
+            // available in KV routing mode where the router has actual bookkeeping.
            ServiceBackend::from_engine(Arc::new(router))
        }
        RouterMode::KV => {

--- a/lib/llm/src/http/service/metrics.rs
+++ b/lib/llm/src/http/service/metrics.rs
@@ -14,10 +14,12 @@ use dynamo_runtime::{
        frontend_service, name_prefix, sanitize_frontend_prometheus_prefix,
    },
 };
-use prometheus::{Encoder, HistogramOpts, HistogramVec, IntCounterVec, IntGaugeVec, Opts};
+use prometheus::{
+    Encoder, GaugeVec, HistogramOpts, HistogramVec, IntCounterVec, IntGaugeVec, Opts,
+};
 use serde::Serialize;
 use std::{
-    sync::Arc,
+    sync::{Arc, LazyLock},
    time::{Duration, Instant},
 };
@@ -29,6 +31,72 @@ pub use prometheus::Registry;
 use super::RouteDoc;
+/// Worker type label values for Prometheus timing metrics
+pub use crate::discovery::{WORKER_TYPE_DECODE, WORKER_TYPE_PREFILL};
+/// Global Prometheus gauge for last observed TTFT per worker (in seconds)
+/// Labels: worker_id, dp_rank, worker_type
+pub static WORKER_LAST_TIME_TO_FIRST_TOKEN_GAUGE: LazyLock<GaugeVec> = LazyLock::new(|| {
+    GaugeVec::new(
+        Opts::new(
+            format!(
+                "dynamo_frontend_{}",
+                frontend_service::WORKER_LAST_TIME_TO_FIRST_TOKEN_SECONDS
+            ),
+            "Last observed time to first token per worker (seconds)",
+        ),
+        &["worker_id", "dp_rank", "worker_type"],
+    )
+    .expect("Failed to create worker_last_time_to_first_token gauge")
+});
+/// Global Prometheus gauge for last observed input sequence tokens per worker
+/// Labels: worker_id, dp_rank, worker_type
+/// Set together with TTFT (separate gauge write) - represents the input token count from the same request
+pub static WORKER_LAST_INPUT_SEQUENCE_TOKENS_GAUGE: LazyLock<IntGaugeVec> = LazyLock::new(|| {
+    IntGaugeVec::new(
+        Opts::new(
+            format!(
+                "dynamo_frontend_{}",
+                frontend_service::WORKER_LAST_INPUT_SEQUENCE_TOKENS
+            ),
+            "Last observed input sequence tokens per worker",
+        ),
+        &["worker_id", "dp_rank", "worker_type"],
+    )
+    .expect("Failed to create worker_last_input_sequence_tokens gauge")
+});
+/// Global Prometheus gauge for last observed ITL per worker (in seconds)
+/// Labels: worker_id, dp_rank, worker_type
+pub static WORKER_LAST_INTER_TOKEN_LATENCY_GAUGE: LazyLock<GaugeVec> = LazyLock::new(|| {
+    GaugeVec::new(
+        Opts::new(
+            format!(
+                "dynamo_frontend_{}",
+                frontend_service::WORKER_LAST_INTER_TOKEN_LATENCY_SECONDS
+            ),
+            "Last observed inter-token latency per worker (seconds)",
+        ),
+        &["worker_id", "dp_rank", "worker_type"],
+    )
+    .expect("Failed to create worker_last_inter_token_latency gauge")
+});
+/// Register the global per-worker TTFT/ITL/input-tokens Prometheus metrics with the given registry.
+///
+/// This should be called once during HTTP service setup to expose the metrics
+/// via the `/metrics` endpoint.
+///
+/// # Errors
+/// Returns an error if any metric is already registered with the registry; gauges registered before the failing one stay registered.
+pub fn register_worker_timing_metrics(registry: &Registry) -> Result<(), prometheus::Error> {
+    registry.register(Box::new(WORKER_LAST_TIME_TO_FIRST_TOKEN_GAUGE.clone()))?;
+    registry.register(Box::new(WORKER_LAST_INPUT_SEQUENCE_TOKENS_GAUGE.clone()))?;
+    registry.register(Box::new(WORKER_LAST_INTER_TOKEN_LATENCY_GAUGE.clone()))?;
+    Ok(())
+}
 /// Generate log-spaced histogram buckets with values rounded to 2 significant figures.
 ///
 /// # Arguments
@@ -259,6 +327,16 @@ pub struct ResponseMetricCollector {
    osl: usize,
    // we track if cached_tokens has been observed to ensure we only increment once per request
    cached_tokens_observed: bool,
+    // Prefill worker info for TTFT attribution (set from LLMMetricAnnotation)
+    prefill_worker_id: Option<u64>,
+    prefill_dp_rank: Option<u32>,
+    // Prefill worker type for Prometheus labeling - stored at routing time to avoid MDC lookup
+    prefill_worker_type: Option<String>,
+    // Decode worker info for ITL attribution (set from LLMMetricAnnotation)
+    decode_worker_id: Option<u64>,
+    decode_dp_rank: Option<u32>,
+    // Decode worker type for Prometheus labeling - stored at routing time to avoid MDC lookup
+    decode_worker_type: Option<String>,
 }
 impl Default for Metrics {
@@ -891,6 +969,44 @@ impl ResponseMetricCollector {
            start_time: Instant::now(),
            osl: 0,
            cached_tokens_observed: false,
+            prefill_worker_id: None,
+            prefill_dp_rank: None,
+            prefill_worker_type: None,
+            decode_worker_id: None,
+            decode_dp_rank: None,
+            decode_worker_type: None,
+        }
+    }
+    /// Set the worker info for per-worker TTFT/ITL metrics; each field is only written while it is still `None`.
+    /// In disaggregated mode, TTFT is attributed to prefill worker, ITL to decode worker.
+    /// Worker types are stored at routing time to avoid expensive MDC lookup when updating metrics.
+    pub fn set_worker_info(
+        &mut self,
+        prefill_worker_id: Option<u64>,
+        prefill_dp_rank: Option<u32>,
+        prefill_worker_type: Option<String>,
+        decode_worker_id: Option<u64>,
+        decode_dp_rank: Option<u32>,
+        decode_worker_type: Option<String>,
+    ) {
+        if self.prefill_worker_id.is_none() {
+            self.prefill_worker_id = prefill_worker_id;
+        }
+        if self.prefill_dp_rank.is_none() {
+            self.prefill_dp_rank = prefill_dp_rank;
+        }
+        if self.prefill_worker_type.is_none() {
+            self.prefill_worker_type = prefill_worker_type;
+        }
+        if self.decode_worker_id.is_none() {
+            self.decode_worker_id = decode_worker_id;
+        }
+        if self.decode_dp_rank.is_none() {
+            self.decode_dp_rank = decode_dp_rank;
+        }
+        if self.decode_worker_type.is_none() {
+            self.decode_worker_type = decode_worker_type;
        }
    }
@@ -941,6 +1057,28 @@ impl ResponseMetricCollector {
                .with_label_values(&[&self.model])
                .observe(ttft);
+            // Update per-worker TTFT and input sequence tokens gauges - attributed to prefill worker.
+            // Both gauges are set back-to-back from the same request (two writes, not one atomic update) to correlate latency with input size.
+            // Use stored worker_type (from routing time) to avoid MDC lookup.
+            // Falls back to WORKER_TYPE_PREFILL if not available.
+            if let Some(worker_id) = self.prefill_worker_id {
+                let worker_id_str = worker_id.to_string();
+                let dp_rank_str = self
+                    .prefill_dp_rank
+                    .map_or("0".to_string(), |r| r.to_string());
+                let worker_type = self
+                    .prefill_worker_type
+                    .as_deref()
+                    .unwrap_or(WORKER_TYPE_PREFILL);
+                let labels = &[worker_id_str.as_str(), dp_rank_str.as_str(), worker_type];
+                WORKER_LAST_TIME_TO_FIRST_TOKEN_GAUGE
+                    .with_label_values(labels)
+                    .set(ttft);
+                WORKER_LAST_INPUT_SEQUENCE_TOKENS_GAUGE
+                    .with_label_values(labels)
+                    .set(isl as i64);
+            }
            // Publish ISL
            // TODO: publish ISL as soon as the tokenization process completes
            self.metrics
@@ -960,6 +1098,23 @@ impl ResponseMetricCollector {
                    .with_label_values(&[&self.model])
                    .observe(itl);
            }
+            // Update per-worker ITL gauge - attributed to decode worker.
+            // Use stored worker_type (from routing time) to avoid MDC lookup.
+            // Falls back to WORKER_TYPE_DECODE if not available.
+            if let Some(worker_id) = self.decode_worker_id {
+                let worker_id_str = worker_id.to_string();
+                let dp_rank_str = self
+                    .decode_dp_rank
+                    .map_or("0".to_string(), |r| r.to_string());
+                let worker_type = self
+                    .decode_worker_type
+                    .as_deref()
+                    .unwrap_or(WORKER_TYPE_DECODE);
+                WORKER_LAST_INTER_TOKEN_LATENCY_GAUGE
+                    .with_label_values(&[worker_id_str.as_str(), dp_rank_str.as_str(), worker_type])
+                    .set(itl);
+            }
        }
        self.last_response_time = Some(current_duration);
@@ -992,6 +1147,14 @@ pub fn process_response_and_observe_metrics<T>(
    if let Ok(Some(metrics)) = LLMMetricAnnotation::from_annotation(annotated) {
        response_collector.observe_current_osl(metrics.output_tokens);
        response_collector.observe_cached_tokens(metrics.cached_tokens);
+        response_collector.set_worker_info(
+            metrics.prefill_worker_id,
+            metrics.prefill_dp_rank,
+            metrics.prefill_worker_type,
+            metrics.decode_worker_id,
+            metrics.decode_dp_rank,
+            metrics.decode_worker_type,
+        );
        // Drop http_queue_guard on first token for non-streaming (same as streaming)
        if response_collector.is_first_token()
@@ -1033,6 +1196,14 @@ pub fn process_response_using_event_converter_and_observe_metrics<T: Serialize>(
    if let Ok(Some(metrics)) = LLMMetricAnnotation::from_annotation(&annotated) {
        response_collector.observe_current_osl(metrics.output_tokens);
        response_collector.observe_cached_tokens(metrics.cached_tokens);
+        response_collector.set_worker_info(
+            metrics.prefill_worker_id,
+            metrics.prefill_dp_rank,
+            metrics.prefill_worker_type,
+            metrics.decode_worker_id,
+            metrics.decode_dp_rank,
+            metrics.decode_worker_type,
+        );
        // Drop http_queue_guard on first token for streaming
        if response_collector.is_first_token()
@@ -1526,6 +1697,12 @@ mod tests {
            output_tokens: 20,
            chunk_tokens: 5,
            cached_tokens: Some(15),
+            prefill_worker_id: None,
+            prefill_dp_rank: None,
+            prefill_worker_type: None,
+            decode_worker_id: None,
+            decode_dp_rank: None,
+            decode_worker_type: None,
        };
        let annotation = llm_metrics.to_annotation::<()>().unwrap();
@@ -1585,6 +1762,12 @@ mod tests {
            output_tokens: 20,
            chunk_tokens: 5,
            cached_tokens: Some(15),
+            prefill_worker_id: None,
+            prefill_dp_rank: None,
+            prefill_worker_type: None,
+            decode_worker_id: None,
+            decode_dp_rank: None,
+            decode_worker_type: None,
        };
        let annotation = llm_metrics.to_annotation::<()>().unwrap();

--- a/lib/llm/src/http/service/service_v2.rs
+++ b/lib/llm/src/http/service/service_v2.rs
@@ -15,7 +15,8 @@ use axum::http::Response;
 use super::Metrics;
 use super::RouteDoc;
 use super::metrics;
-use crate::discovery::ModelManager;
+use super::metrics::register_worker_timing_metrics;
+use crate::discovery::{ModelManager, register_worker_load_metrics};
 use crate::endpoint_type::EndpointType;
 use crate::request_template::RequestTemplate;
 use anyhow::Result;
@@ -392,6 +393,18 @@ impl HttpServiceConfigBuilder {
        let registry = metrics::Registry::new();
        state.metrics_clone().register(&registry)?;
+        // Register worker load metrics (active_decode_blocks, active_prefill_tokens per worker)
+        // These are updated by KvWorkerMonitor when receiving ActiveLoad events
+        if let Err(e) = register_worker_load_metrics(&registry) {
+            tracing::warn!("Failed to register worker load metrics: {}", e);
+        }
+        // Register worker timing metrics (last_ttft, last_input_sequence_tokens, last_itl per worker)
+        // These are updated by ResponseMetricCollector when observing TTFT/ITL
+        if let Err(e) = register_worker_timing_metrics(&registry) {
+            tracing::warn!("Failed to register worker timing metrics: {}", e);
+        }
        // DEPRECATED: To be removed after custom backends migrate to Dynamo backend.
        // Setup custom backend metrics if configured
        let custom_backend_registry =

--- a/lib/llm/src/kv_router.rs
+++ b/lib/llm/src/kv_router.rs
@@ -333,6 +333,7 @@ pub struct KvRouter {
 }
 impl KvRouter {
+    #[allow(clippy::too_many_arguments)]
    pub async fn new(
        endpoint: Endpoint,
        client: Client,
@@ -341,6 +342,7 @@ impl KvRouter {
        selector: Option<Box<dyn WorkerSelector + Send + Sync>>,
        kv_router_config: Option<KvRouterConfig>,
        router_id: u64,
+        worker_type: &'static str,
    ) -> Result<Self> {
        let kv_router_config = kv_router_config.unwrap_or_default();
        let component = endpoint.component();
@@ -382,6 +384,7 @@ impl KvRouter {
            selector,
            kv_router_config.router_replica_sync,
            router_id,
+            worker_type,
        )
        .await?;
@@ -581,6 +584,12 @@ impl KvRouter {
        self.scheduler.free(request_id).await
    }
+    /// Get the worker type for this router ("prefill" or "decode").
+    /// Used for Prometheus metric labeling.
+    pub fn worker_type(&self) -> &'static str {
+        self.scheduler.worker_type()
+    }
    pub async fn add_output_block(
        &self,
        request_id: &str,
@@ -926,11 +935,13 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
            }
        }
-        // Record metrics in tracker: KV hit rate and worker ID based on phase
+        // Record metrics in tracker: KV hit rate, worker ID, and worker type based on phase.
+        // Worker type is stored at routing time to avoid expensive MDC lookups when
+        // updating Prometheus metrics (TTFT/ITL) later in the response stream.
        if let Some(ref tracker) = request.tracker {
            let isl_blocks = request.token_ids.len().div_ceil(block_size);
            tracker.record_kv_hit(overlap_amount, isl_blocks);
-            tracker.record_worker(instance_id);
+            tracker.record_worker_full(instance_id, dp_rank, self.chooser.worker_type());
        }
        // Handle query-only requests: early return with worker info

--- a/lib/llm/src/kv_router/prefill_router.rs
+++ b/lib/llm/src/kv_router/prefill_router.rs
@@ -24,7 +24,7 @@ use crate::{
    kv_router::{KvPushRouter, KvRouterConfig, RouterConfigOverride},
    protocols::common::llm_backend::{LLMEngineOutput, PreprocessedRequest},
    protocols::common::preprocessor::{BootstrapInfo, PrefillResult},
-    protocols::common::timing::{RequestPhase, RequestTracker},
+    protocols::common::timing::{RequestPhase, RequestTracker, WORKER_TYPE_PREFILL},
 };
 /// Errors that can occur during prefill routing
@@ -50,6 +50,8 @@ enum InnerPrefillRouter {
    /// KV-aware routing using KvPushRouter
    KvRouter(Arc<KvPushRouter>),
    /// Simple routing (RoundRobin, Random, Direct)
+    /// Note: Per-worker metrics (active_prefill_tokens, active_decode_blocks) are only
+    /// available in KV routing mode where the router has actual bookkeeping.
    SimpleRouter(Arc<PushRouter<PreprocessedRequest, Annotated<LLMEngineOutput>>>),
 }
@@ -104,6 +106,8 @@ pub struct PrefillRouter {
    cancel_token: CancellationToken,
    router_mode: RouterMode,
    enforce_disagg: bool,
+    /// Model name used to look up the worker monitor for prefill client registration
+    model_name: String,
 }
 impl PrefillRouter {
@@ -120,6 +124,7 @@ impl PrefillRouter {
            cancel_token: CancellationToken::new(),
            router_mode,
            enforce_disagg,
+            model_name: String::new(), // Not used for disabled router
        })
    }
@@ -130,6 +135,7 @@ impl PrefillRouter {
        kv_cache_block_size: u32,
        kv_router_config: Option<KvRouterConfig>,
        enforce_disagg: bool,
+        model_name: String,
    ) -> Arc<Self> {
        let prefill_router = OnceLock::new();
        let cancel_token = CancellationToken::new();
@@ -141,6 +147,7 @@ impl PrefillRouter {
            cancel_token: cancel_token.clone(),
            router_mode,
            enforce_disagg,
+            model_name,
        });
        // Spawn background task to wait for activation
@@ -194,14 +201,24 @@ impl PrefillRouter {
            .await?;
        let inner_router = if self.router_mode.is_kv_routing() {
-            // Create KV chooser using the endpoint
+            // Create KV chooser using the endpoint (this is a prefill router)
            let kv_chooser = model_manager
-                .kv_chooser_for(&endpoint, kv_cache_block_size, kv_router_config)
+                .kv_chooser_for(
+                    &endpoint,
+                    kv_cache_block_size,
+                    kv_router_config,
+                    WORKER_TYPE_PREFILL,
+                )
                .await?;
            // Extract client from kv_chooser to ensure shared state
            let client = kv_chooser.client().clone();
+            // Register prefill client with worker monitor for TTFT metric cleanup in disaggregated mode
+            if let Some(monitor) = model_manager.get_worker_monitor(&self.model_name) {
+                monitor.set_prefill_client(client.clone());
+            }
            // Build the PushRouter for prefill with KV mode using the shared client
            let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold(
                client,
@@ -217,7 +234,14 @@ impl PrefillRouter {
            // Create client for simple router
            let client = endpoint.client().await?;
+            // Register prefill client with worker monitor for TTFT metric cleanup in disaggregated mode
+            if let Some(monitor) = model_manager.get_worker_monitor(&self.model_name) {
+                monitor.set_prefill_client(client.clone());
+            }
            // Create simple push router with the frontend's router mode
+            // Note: Per-worker metrics (active_prefill_tokens, active_decode_blocks) are only
+            // available in KV routing mode where the router has actual bookkeeping.
            let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold(
                client,
                self.router_mode,
@@ -325,12 +349,14 @@ impl PrefillRouter {
    /// If `phase_permit` is provided, it is dropped after the first output is received,
    /// allowing subsequent `set_phase` calls to proceed. This is used in the bootstrap
    /// optimization path to ensure `record_worker` completes before the phase changes.
+    ///
+    /// Returns (PrefillResult, Option<(worker_id, dp_rank)>); dp_rank falls back to 0 if absent, and a missing worker id yields `None`.
    async fn execute_prefill(
        router: Option<InnerPrefillRouter>,
        request: SingleIn<PreprocessedRequest>,
        target_worker: Option<u64>,
        phase_permit: Option<OwnedSemaphorePermit>,
-    ) -> Result<(PrefillResult, Option<u64>), PrefillError> {
+    ) -> Result<(PrefillResult, Option<(u64, u32)>), PrefillError> {
        let router = router.ok_or(PrefillError::NotActivated)?;
        let mut prefill_response = router
            .generate_to_worker(request, target_worker)
@@ -382,20 +408,27 @@ impl PrefillRouter {
            ));
        };
-        // Extract prefill worker ID from disaggregated_params
+        // Extract prefill worker ID and dp_rank from disaggregated_params
-        let prefill_worker_id = disaggregated_params
+        let prefill_worker_info =
+            disaggregated_params
                .get("worker_id")
                .and_then(|worker_id_json| {
-                worker_id_json
+                    let worker_id = worker_id_json
                        .get("prefill_worker_id")
+                        .and_then(|v| v.as_u64())?;
+                    let dp_rank = worker_id_json
+                        .get("prefill_dp_rank")
                        .and_then(|v| v.as_u64())
+                        .map(|r| r as u32)
+                        .unwrap_or(0);
+                    Some((worker_id, dp_rank))
                });
        Ok((
            PrefillResult {
                disaggregated_params,
                prompt_tokens_details,
            },
-            prefill_worker_id,
+            prefill_worker_info,
        ))
    }
@@ -437,14 +470,16 @@ impl PrefillRouter {
        );
    }
-    /// Call the prefill router and extract structured prefill result and worker ID.
+    /// Call the prefill router and extract structured prefill result, worker ID, and dp_rank.
    ///
    /// This is the synchronous prefill path - we wait for prefill to complete before proceeding.
    /// No phase permit is needed since `record_worker` completes before we return.
+    ///
+    /// Returns (PrefillResult, Option<(worker_id, dp_rank)>).
    async fn call_prefill(
        &self,
        request: SingleIn<PreprocessedRequest>,
-    ) -> Result<(PrefillResult, Option<u64>), PrefillError> {
+    ) -> Result<(PrefillResult, Option<(u64, u32)>), PrefillError> {
        // For call_prefill path, routing is handled by the router itself (no direct routing needed)
        // No phase permit needed - we wait for completion before changing phase
        Self::execute_prefill(self.prefill_router.get().cloned(), request, None, None).await
@@ -522,6 +557,14 @@ impl
                    router.select_next_worker();
                }
+                // Record prefill worker on the main request's tracker for metrics.
+                // (The cloned prefill_req has its own tracker, so we need to record here)
+                // Worker type is stored at routing time to avoid expensive MDC lookups when
+                // updating Prometheus TTFT metrics later in the response stream.
+                if let Some(ref tracker) = req.tracker {
+                    tracker.record_prefill_worker_full(worker_id, dp_rank, WORKER_TYPE_PREFILL);
+                }
                let routing = prefill_req.routing_mut();
                routing.prefill_worker_id = Some(worker_id);
                routing.dp_rank = Some(dp_rank);
@@ -546,9 +589,21 @@ impl
                let prefill_context = Context::with_id(prefill_req, request_id.clone());
                engine_ctx.link_child(prefill_context.context());
-                self.call_prefill(prefill_context)
+                let result = self.call_prefill(prefill_context).await;
-                    .await
-                    .map(|(result, worker_id)| (Some(result), worker_id, None))
+                // Record prefill worker on the main request's tracker for metrics.
+                // (call_prefill returns the worker_id and dp_rank from the prefill routing)
+                // Worker type is stored at routing time to avoid expensive MDC lookups when
+                // updating Prometheus TTFT metrics later in the response stream.
+                if let Ok((_, Some((worker_id, dp_rank)))) = &result
+                    && let Some(ref tracker) = req.tracker
+                {
+                    tracker.record_prefill_worker_full(*worker_id, *dp_rank, WORKER_TYPE_PREFILL);
+                }
+                result.map(|(result, worker_info)| {
+                    (Some(result), worker_info.map(|(id, _)| id), None)
+                })
            }
        }
        .instrument(tracing::info_span!("prefill_routing"))

--- a/lib/llm/src/kv_router/scheduler.rs
+++ b/lib/llm/src/kv_router/scheduler.rs
@@ -101,6 +101,7 @@ impl KvScheduler {
        selector: Option<Box<dyn WorkerSelector + Send + Sync>>,
        replica_sync: bool,
        router_id: u64,
+        worker_type: &'static str,
    ) -> Result<Self, KvSchedulerError> {
        let selector = selector.unwrap_or(Box::new(DefaultWorkerSelector::default()));
@@ -119,6 +120,7 @@ impl KvScheduler {
                initial_workers,
                replica_sync,
                router_id,
+                worker_type,
            )
            .await
            .map_err(|e| KvSchedulerError::InitFailed(e.to_string()))?,
@@ -345,6 +347,12 @@ impl KvScheduler {
        self.slots.free(&request_id.to_string()).await
    }
+    /// Get the worker type for this scheduler ("prefill" or "decode").
+    /// Used for Prometheus metric labeling.
+    ///
+    /// Thin accessor: forwards to `self.slots.worker_type()` and returns its result unchanged.
+    pub fn worker_type(&self) -> &'static str {
+        self.slots.worker_type()
+    }
    pub async fn add_output_block(
        &self,
        request_id: &str,

--- a/lib/llm/src/kv_router/sequence.rs
+++ b/lib/llm/src/kv_router/sequence.rs
@@ -40,6 +40,7 @@ use uuid::Uuid;
 use super::protocols::{
    ActiveLoad, ActiveSequenceEvent, ActiveSequenceEventData, WorkerWithDpRank,
 };
+use crate::discovery::{WORKER_ACTIVE_DECODE_BLOCKS_GAUGE, WORKER_ACTIVE_PREFILL_TOKENS_GAUGE};
 use crate::kv_router::{ACTIVE_SEQUENCES_SUBJECT, KV_METRICS_SUBJECT};
 use crate::local_model::runtime_config::ModelRuntimeConfig;
 use dynamo_runtime::CancellationToken;
@@ -415,6 +416,8 @@ pub struct ActiveSequencesMultiWorker {
    /// Publisher for metrics (namespace-scoped)
    metrics_publisher: EventPublisher,
    replica_sync: bool,
+    /// Worker type for Prometheus metrics labeling ("prefill" or "decode")
+    worker_type: &'static str,
 }
 impl ActiveSequencesMultiWorker {
@@ -424,6 +427,7 @@ impl ActiveSequencesMultiWorker {
        workers_with_configs: HashMap<u64, Option<ModelRuntimeConfig>>,
        replica_sync: bool,
        router_id: u64,
+        worker_type: &'static str,
    ) -> Result<Self> {
        assert!(block_size > 1, "block_size must be greater than 1");
@@ -462,6 +466,7 @@ impl ActiveSequencesMultiWorker {
            metrics_publisher,
            router_id,
            replica_sync,
+            worker_type,
        };
        // Start the subscription loop only if replica_sync is enabled
@@ -1045,7 +1050,25 @@ impl ActiveSequencesMultiWorker {
            }
        };
-        // Publish ActiveLoad
+        // Update Prometheus gauges directly (router's own bookkeeping)
+        let worker_id_str = worker.worker_id.to_string();
+        let dp_rank_str = worker.dp_rank.to_string();
+        WORKER_ACTIVE_DECODE_BLOCKS_GAUGE
+            .with_label_values(&[
+                worker_id_str.as_str(),
+                dp_rank_str.as_str(),
+                self.worker_type,
+            ])
+            .set(active_blocks as i64);
+        WORKER_ACTIVE_PREFILL_TOKENS_GAUGE
+            .with_label_values(&[
+                worker_id_str.as_str(),
+                dp_rank_str.as_str(),
+                self.worker_type,
+            ])
+            .set(active_tokens as i64);
+        // Also publish ActiveLoad to NATS for other subscribers (if NATS is available)
        let active_load = ActiveLoad {
            worker_id: worker.worker_id,
            dp_rank: worker.dp_rank,
@@ -1054,7 +1077,8 @@ impl ActiveSequencesMultiWorker {
        };
        if let Err(e) = self.metrics_publisher.publish(&active_load).await {
-            tracing::warn!("Failed to publish ActiveLoad for worker {worker:?}: {e:?}");
+            // This is expected if NATS is not available - the local gauge update above already succeeded
+            tracing::trace!("Failed to publish ActiveLoad to NATS for worker {worker:?}: {e:?}");
        }
    }
@@ -1063,6 +1087,12 @@ impl ActiveSequencesMultiWorker {
        self.senders.len()
    }
+    /// Get the worker type for this router ("prefill" or "decode").
+    /// Used for Prometheus metric labeling.
+    ///
+    /// Returns the `worker_type` field as-is. The same field is passed as a label value
+    /// when this type updates the per-worker Prometheus gauges.
+    pub fn worker_type(&self) -> &'static str {
+        self.worker_type
+    }
    /// Generic method to query all workers with a given command
    async fn query_workers<T: Send + 'static>(
        &self,
@@ -1301,11 +1331,19 @@ mod tests {
                workers_with_configs.clone(),
                true,
                1,
+                crate::discovery::WORKER_TYPE_DECODE,
            )
            .await?,
        );
        let seq_manager_2 = Arc::new(
-            ActiveSequencesMultiWorker::new(component, block_size, workers_with_configs, true, 2)
+            ActiveSequencesMultiWorker::new(
+                component,
+                block_size,
+                workers_with_configs,
+                true,
+                2,
+                crate::discovery::WORKER_TYPE_DECODE,
+            )
            .await?,
        );
@@ -1463,11 +1501,19 @@ mod tests {
                workers_with_configs.clone(),
                true,
                1,
+                crate::discovery::WORKER_TYPE_DECODE,
            )
            .await?,
        );
        let seq_manager_2 = Arc::new(
-            ActiveSequencesMultiWorker::new(component, block_size, workers_with_configs, true, 2)
+            ActiveSequencesMultiWorker::new(
+                component,
+                block_size,
+                workers_with_configs,
+                true,
+                2,
+                crate::discovery::WORKER_TYPE_DECODE,
+            )
            .await?,
        );

--- a/lib/llm/src/preprocessor.rs
+++ b/lib/llm/src/preprocessor.rs
@@ -72,6 +72,26 @@ pub struct LLMMetricAnnotation {
    pub output_tokens: usize,
    pub chunk_tokens: usize,
    pub cached_tokens: Option<usize>,
+    /// Prefill worker ID (for TTFT attribution in disaggregated mode)
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub prefill_worker_id: Option<u64>,
+    /// Prefill worker DP rank
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub prefill_dp_rank: Option<u32>,
+    /// Prefill worker type ("prefill" or "decode") for Prometheus metric labeling.
+    /// Stored at routing time to avoid expensive MDC lookup when updating TTFT metrics.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub prefill_worker_type: Option<String>,
+    /// Decode worker ID (for ITL attribution in disaggregated mode)
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub decode_worker_id: Option<u64>,
+    /// Decode worker DP rank
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub decode_dp_rank: Option<u32>,
+    /// Decode worker type ("prefill" or "decode") for Prometheus metric labeling.
+    /// Stored at routing time to avoid expensive MDC lookup when updating ITL metrics.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub decode_worker_type: Option<String>,
 }
 impl LLMMetricAnnotation {
@@ -657,12 +677,32 @@ impl OpenAIPreprocessor {
                            .map_err(|e| e.to_string())
                    });
-                    // Create LLM metrics annotation
+                    // Create LLM metrics annotation with prefill/decode worker info from tracker.
+                    // Worker types are stored at routing time to avoid expensive MDC lookup.
+                    let tracker = inner.response_generator.tracker();
+                    let prefill_worker_id = tracker.as_ref().and_then(|t| t.prefill_worker_id());
+                    let prefill_dp_rank = tracker.as_ref().and_then(|t| t.prefill_dp_rank());
+                    let prefill_worker_type = tracker
+                        .as_ref()
+                        .and_then(|t| t.prefill_worker_type())
+                        .map(String::from);
+                    let decode_worker_id = tracker.as_ref().and_then(|t| t.decode_worker_id());
+                    let decode_dp_rank = tracker.as_ref().and_then(|t| t.decode_dp_rank());
+                    let decode_worker_type = tracker
+                        .as_ref()
+                        .and_then(|t| t.decode_worker_type())
+                        .map(String::from);
                    let llm_metrics = LLMMetricAnnotation {
                        input_tokens: isl,
                        output_tokens: current_osl,
                        chunk_tokens,
                        cached_tokens: None,
+                        prefill_worker_id,
+                        prefill_dp_rank,
+                        prefill_worker_type,
+                        decode_worker_id,
+                        decode_dp_rank,
+                        decode_worker_type,
                    };
                    if let Ok(metrics_annotated) = llm_metrics.to_annotation::<()>() {
@@ -695,6 +735,20 @@ impl OpenAIPreprocessor {
                        let usage_chunk = inner.response_generator.create_usage_chunk();
                        let usage = inner.response_generator.get_usage();
+                        let tracker = inner.response_generator.tracker();
+                        let prefill_worker_id =
+                            tracker.as_ref().and_then(|t| t.prefill_worker_id());
+                        let prefill_dp_rank = tracker.as_ref().and_then(|t| t.prefill_dp_rank());
+                        let prefill_worker_type = tracker
+                            .as_ref()
+                            .and_then(|t| t.prefill_worker_type())
+                            .map(String::from);
+                        let decode_worker_id = tracker.as_ref().and_then(|t| t.decode_worker_id());
+                        let decode_dp_rank = tracker.as_ref().and_then(|t| t.decode_dp_rank());
+                        let decode_worker_type = tracker
+                            .as_ref()
+                            .and_then(|t| t.decode_worker_type())
+                            .map(String::from);
                        let llm_metrics = LLMMetricAnnotation {
                            input_tokens: usage.prompt_tokens as usize,
                            output_tokens: usage.completion_tokens as usize,
@@ -703,6 +757,12 @@ impl OpenAIPreprocessor {
                                .prompt_tokens_details
                                .as_ref()
                                .and_then(|d| d.cached_tokens.map(|c| c as usize)),
+                            prefill_worker_id,
+                            prefill_dp_rank,
+                            prefill_worker_type,
+                            decode_worker_id,
+                            decode_dp_rank,
+                            decode_worker_type,
                        };
                        // Create annotation string

--- a/lib/llm/src/protocols/common/timing.rs
+++ b/lib/llm/src/protocols/common/timing.rs
@@ -6,7 +6,10 @@
 //! This module provides [`RequestTracker`] for tracking timing and routing information
 //! that can be returned to clients via the `nvext` response field.
-use std::sync::{Arc, OnceLock};
+use std::sync::{
+    Arc, OnceLock,
+    atomic::{AtomicU32, AtomicU64, Ordering},
+};
 use std::time::{Instant, SystemTime, UNIX_EPOCH};
 use parking_lot::Mutex;
@@ -16,6 +19,17 @@ use utoipa::ToSchema;
 use crate::protocols::openai::nvext::WorkerIdInfo;
+/// Sentinel value indicating no worker ID has been set.
+/// We use 0 as the sentinel since valid worker IDs are non-zero lease IDs from etcd.
+const NO_WORKER_ID: u64 = 0;
+/// Sentinel value indicating no DP rank has been set.
+/// `u32::MAX` is used rather than 0 because 0 is a valid DP rank.
+const NO_DP_RANK: u32 = u32::MAX;
+/// Worker type constants for Prometheus metric labels.
+/// These are stored in RequestTracker at routing time to avoid costly MDC lookups
+/// when updating per-worker metrics (TTFT, ITL).
+pub const WORKER_TYPE_PREFILL: &str = "prefill";
+/// Prometheus label value for decode workers; counterpart of `WORKER_TYPE_PREFILL`.
+pub const WORKER_TYPE_DECODE: &str = "decode";
 /// Phase of the request in disaggregated serving.
 ///
 /// Used to determine which worker ID field to record when routing.
@@ -48,10 +62,15 @@ impl std::fmt::Display for RequestPhase {
 /// - `first_token_time`: When the first token was generated (set once via OnceLock)
 /// - `request_finish_time`: When the request finished (set once via OnceLock)
 /// - KV cache hit rate information
+/// - Worker IDs and types for per-worker Prometheus metrics
 ///
 /// The `OnceLock` fields ensure that values are set at most once (the first write wins),
 /// which is important for disaggregated serving where the "first token"
 /// might appear multiple times.
+///
+/// Worker IDs use `AtomicU64` instead of `OnceLock<u64>` for lower overhead since
+/// the tracker is created for every request. The sentinel value `NO_WORKER_ID` (0)
+/// indicates no worker has been recorded yet.
 #[derive(Debug)]
 pub struct RequestTracker {
    /// When the request was received (monotonic clock for duration calculations)
@@ -75,11 +94,30 @@ pub struct RequestTracker {
    /// Input sequence length in blocks (for hit rate calculation) - set once via OnceLock
    isl_blocks: OnceLock<usize>,
-    /// Prefill worker ID (for disaggregated serving) - set once via OnceLock
+    /// Prefill worker ID (for disaggregated serving).
-    prefill_worker_id: OnceLock<u64>,
+    /// Uses atomic with compare-exchange for set-once semantics.
+    /// Value of 0 (NO_WORKER_ID) means not yet set.
+    prefill_worker_id: AtomicU64,
+    /// Prefill DP rank. Value of u32::MAX (NO_DP_RANK) means not yet set.
+    prefill_dp_rank: AtomicU32,
+    /// Decode worker ID. Value of 0 (NO_WORKER_ID) means not yet set.
+    decode_worker_id: AtomicU64,
+    /// Decode DP rank. Value of u32::MAX (NO_DP_RANK) means not yet set.
+    decode_dp_rank: AtomicU32,
+    /// Worker type for the prefill worker ("prefill" or "decode").
+    /// Stored at routing time to avoid MDC lookup when updating Prometheus metrics.
+    /// In aggregated mode, this will be "decode" since the same worker handles both.
+    /// This is necessary because TTFT metrics need to know the worker type label,
+    /// and looking up MDC by worker_id would require iterating all cards (O(n)).
+    prefill_worker_type: OnceLock<&'static str>,
-    /// Decode worker ID - set once via OnceLock
+    /// Worker type for the decode worker (always "decode").
-    decode_worker_id: OnceLock<u64>,
+    /// Stored for symmetry with prefill_worker_type. Expected to be "decode", but the recorder stores whatever label the caller passes.
+    decode_worker_type: OnceLock<&'static str>,
    /// Request phase (Prefill/Decode/Aggregated)
    phase: Mutex<RequestPhase>,
@@ -108,8 +146,12 @@ impl RequestTracker {
            request_finish_time: OnceLock::new(),
            kv_overlap_blocks: OnceLock::new(),
            isl_blocks: OnceLock::new(),
-            prefill_worker_id: OnceLock::new(),
+            prefill_worker_id: AtomicU64::new(NO_WORKER_ID),
-            decode_worker_id: OnceLock::new(),
+            prefill_dp_rank: AtomicU32::new(NO_DP_RANK),
+            decode_worker_id: AtomicU64::new(NO_WORKER_ID),
+            decode_dp_rank: AtomicU32::new(NO_DP_RANK),
+            prefill_worker_type: OnceLock::new(),
+            decode_worker_type: OnceLock::new(),
            phase: Mutex::new(RequestPhase::Aggregated),
            phase_semaphore: Arc::new(Semaphore::new(1)),
        }
@@ -177,12 +219,82 @@ impl RequestTracker {
    /// Record the prefill worker ID. Returns true if this was the first call.
+    ///
+    /// The write is a compare-exchange from `NO_WORKER_ID` to `id`: only a call that finds
+    /// the slot still unset stores its value. Once a real ID is stored, later calls leave
+    /// it untouched and return false.
+    ///
+    /// Caveat: passing `NO_WORKER_ID` itself makes the exchange succeed (0 -> 0), so true is
+    /// returned even though the slot still reads as unset afterwards.
    pub fn record_prefill_worker(&self, id: u64) -> bool {
-        self.prefill_worker_id.set(id).is_ok()
+        self.prefill_worker_id
+            .compare_exchange(NO_WORKER_ID, id, Ordering::SeqCst, Ordering::SeqCst)
+            .is_ok()
+    }
+    /// Record the prefill worker ID and DP rank. Returns true if worker_id was recorded for the first time.
+    /// Only sets the dp_rank if the worker_id is newly set to avoid mismatched worker_id/dp_rank pairs.
+    ///
+    /// Passing the `NO_WORKER_ID` sentinel records nothing and returns false. Without this
+    /// guard the compare-exchange 0 -> 0 would succeed and a rank would be stored against
+    /// a worker ID that still reads as unset.
+    pub fn record_prefill_worker_with_rank(&self, id: u64, dp_rank: u32) -> bool {
+        // The sentinel can never be "recorded"; reject it before touching any slot.
+        if id == NO_WORKER_ID {
+            return false;
+        }
+        let is_new = self
+            .prefill_worker_id
+            .compare_exchange(NO_WORKER_ID, id, Ordering::SeqCst, Ordering::SeqCst)
+            .is_ok();
+        if is_new {
+            // Only the call that claimed the ID slot may write the matching rank.
+            self.prefill_dp_rank.store(dp_rank, Ordering::SeqCst);
+        }
+        is_new
+    }
+    /// Record the prefill worker ID, DP rank, and worker type.
+    /// The worker_type is stored to avoid MDC lookup when updating Prometheus metrics.
+    /// Returns true if worker_id was recorded for the first time.
+    ///
+    /// Rank and type are written only by the call that wins the compare-exchange on the
+    /// worker ID, so neither is ever taken from a different call than the ID.
+    ///
+    /// Passing the `NO_WORKER_ID` sentinel records nothing and returns false. Without this
+    /// guard the compare-exchange 0 -> 0 would succeed, and the set-once worker type would
+    /// be fixed for a worker ID that still reads as unset, so a later real recording
+    /// could no longer correct it.
+    pub fn record_prefill_worker_full(
+        &self,
+        id: u64,
+        dp_rank: u32,
+        worker_type: &'static str,
+    ) -> bool {
+        // The sentinel can never be "recorded"; reject it before touching any slot.
+        if id == NO_WORKER_ID {
+            return false;
+        }
+        let is_new = self
+            .prefill_worker_id
+            .compare_exchange(NO_WORKER_ID, id, Ordering::SeqCst, Ordering::SeqCst)
+            .is_ok();
+        if is_new {
+            self.prefill_dp_rank.store(dp_rank, Ordering::SeqCst);
+            // `set` fails only if a type is already stored; the existing value is kept.
+            let _ = self.prefill_worker_type.set(worker_type);
+        }
+        is_new
    }
    /// Record the decode worker ID. Returns true if this was the first call.
+    ///
+    /// The write is a compare-exchange from `NO_WORKER_ID` to `id`: only a call that finds
+    /// the slot still unset stores its value. Once a real ID is stored, later calls leave
+    /// it untouched and return false.
+    ///
+    /// Caveat: passing `NO_WORKER_ID` itself makes the exchange succeed (0 -> 0), so true is
+    /// returned even though the slot still reads as unset afterwards.
    pub fn record_decode_worker(&self, id: u64) -> bool {
-        self.decode_worker_id.set(id).is_ok()
+        self.decode_worker_id
+            .compare_exchange(NO_WORKER_ID, id, Ordering::SeqCst, Ordering::SeqCst)
+            .is_ok()
+    }
+    /// Record the decode worker ID and DP rank. Returns true if worker_id was recorded for the first time.
+    /// Only sets the dp_rank if the worker_id is newly set to avoid mismatched worker_id/dp_rank pairs.
+    ///
+    /// Passing the `NO_WORKER_ID` sentinel records nothing and returns false. Without this
+    /// guard the compare-exchange 0 -> 0 would succeed and a rank would be stored against
+    /// a worker ID that still reads as unset.
+    pub fn record_decode_worker_with_rank(&self, id: u64, dp_rank: u32) -> bool {
+        // The sentinel can never be "recorded"; reject it before touching any slot.
+        if id == NO_WORKER_ID {
+            return false;
+        }
+        let is_new = self
+            .decode_worker_id
+            .compare_exchange(NO_WORKER_ID, id, Ordering::SeqCst, Ordering::SeqCst)
+            .is_ok();
+        if is_new {
+            // Only the call that claimed the ID slot may write the matching rank.
+            self.decode_dp_rank.store(dp_rank, Ordering::SeqCst);
+        }
+        is_new
+    }
+    /// Record the decode worker ID, DP rank, and worker type.
+    /// The worker_type is stored to avoid MDC lookup when updating Prometheus metrics.
+    /// Returns true if worker_id was recorded for the first time.
+    ///
+    /// Rank and type are written only by the call that wins the compare-exchange on the
+    /// worker ID, so neither is ever taken from a different call than the ID.
+    ///
+    /// Passing the `NO_WORKER_ID` sentinel records nothing and returns false. Without this
+    /// guard the compare-exchange 0 -> 0 would succeed, and the set-once worker type would
+    /// be fixed for a worker ID that still reads as unset, so a later real recording
+    /// could no longer correct it.
+    pub fn record_decode_worker_full(
+        &self,
+        id: u64,
+        dp_rank: u32,
+        worker_type: &'static str,
+    ) -> bool {
+        // The sentinel can never be "recorded"; reject it before touching any slot.
+        if id == NO_WORKER_ID {
+            return false;
+        }
+        let is_new = self
+            .decode_worker_id
+            .compare_exchange(NO_WORKER_ID, id, Ordering::SeqCst, Ordering::SeqCst)
+            .is_ok();
+        if is_new {
+            self.decode_dp_rank.store(dp_rank, Ordering::SeqCst);
+            // `set` fails only if a type is already stored; the existing value is kept.
+            let _ = self.decode_worker_type.set(worker_type);
+        }
+        is_new
    }
    /// Set the request phase and return a permit that blocks subsequent phase changes.
@@ -230,10 +342,56 @@ impl RequestTracker {
        }
    }
+    /// Attribute a worker and its DP rank to this request according to the current phase.
+    ///
+    /// - Prefill phase: fills the prefill worker/rank slots
+    /// - Decode phase: fills the decode worker/rank slots
+    /// - Aggregated phase: fills both, prefill first
+    pub fn record_worker_with_rank(&self, instance_id: u64, dp_rank: u32) {
+        // Kept as an exhaustive match so a new `RequestPhase` variant must pick its slots here.
+        let (as_prefill, as_decode) = match self.phase() {
+            RequestPhase::Prefill => (true, false),
+            RequestPhase::Decode => (false, true),
+            RequestPhase::Aggregated => (true, true),
+        };
+        if as_prefill {
+            self.record_prefill_worker_with_rank(instance_id, dp_rank);
+        }
+        if as_decode {
+            self.record_decode_worker_with_rank(instance_id, dp_rank);
+        }
+    }
+    /// Attribute a worker, its DP rank, and its worker-type label to this request
+    /// according to the current phase.
+    ///
+    /// Prefer this over `record_worker_with_rank` whenever the label is already known:
+    /// it is stored alongside the ID so later Prometheus updates need no MDC lookup.
+    ///
+    /// - Prefill phase: fills the prefill slots with the given label
+    /// - Decode phase: fills the decode slots with the given label
+    /// - Aggregated phase: fills both (prefill first) with the same label, since one
+    ///   worker serves both stages
+    pub fn record_worker_full(&self, instance_id: u64, dp_rank: u32, worker_type: &'static str) {
+        // Kept as an exhaustive match so a new `RequestPhase` variant must pick its slots here.
+        let (as_prefill, as_decode) = match self.phase() {
+            RequestPhase::Prefill => (true, false),
+            RequestPhase::Decode => (false, true),
+            RequestPhase::Aggregated => (true, true),
+        };
+        if as_prefill {
+            self.record_prefill_worker_full(instance_id, dp_rank, worker_type);
+        }
+        if as_decode {
+            self.record_decode_worker_full(instance_id, dp_rank, worker_type);
+        }
+    }
    /// Get worker ID information if any worker IDs have been recorded.
    pub fn get_worker_info(&self) -> Option<WorkerIdInfo> {
-        let prefill = self.prefill_worker_id.get().copied();
+        let prefill = self.prefill_worker_id();
-        let decode = self.decode_worker_id.get().copied();
+        let decode = self.decode_worker_id();
        if prefill.is_none() && decode.is_none() {
            return None;
@@ -241,10 +399,46 @@ impl RequestTracker {
        Some(WorkerIdInfo {
            prefill_worker_id: prefill,
+            prefill_dp_rank: self.prefill_dp_rank(),
            decode_worker_id: decode,
+            decode_dp_rank: self.decode_dp_rank(),
        })
    }
+    /// Returns the decode worker ID, or `None` while the slot still holds `NO_WORKER_ID`.
+    pub fn decode_worker_id(&self) -> Option<u64> {
+        match self.decode_worker_id.load(Ordering::SeqCst) {
+            NO_WORKER_ID => None,
+            recorded => Some(recorded),
+        }
+    }
+    /// Returns the decode DP rank, or `None` while the slot still holds `NO_DP_RANK`.
+    pub fn decode_dp_rank(&self) -> Option<u32> {
+        match self.decode_dp_rank.load(Ordering::SeqCst) {
+            NO_DP_RANK => None,
+            recorded => Some(recorded),
+        }
+    }
+    /// Returns the prefill worker ID, or `None` while the slot still holds `NO_WORKER_ID`.
+    pub fn prefill_worker_id(&self) -> Option<u64> {
+        match self.prefill_worker_id.load(Ordering::SeqCst) {
+            NO_WORKER_ID => None,
+            recorded => Some(recorded),
+        }
+    }
+    /// Returns the prefill DP rank, or `None` while the slot still holds `NO_DP_RANK`.
+    pub fn prefill_dp_rank(&self) -> Option<u32> {
+        match self.prefill_dp_rank.load(Ordering::SeqCst) {
+            NO_DP_RANK => None,
+            recorded => Some(recorded),
+        }
+    }
+    /// Returns the label stored for the prefill worker, or `None` if none has been set yet.
+    pub fn prefill_worker_type(&self) -> Option<&'static str> {
+        let label = self.prefill_worker_type.get()?;
+        Some(*label)
+    }
+    /// Returns the label stored for the decode worker, or `None` if none has been set yet.
+    pub fn decode_worker_type(&self) -> Option<&'static str> {
+        let label = self.decode_worker_type.get()?;
+        Some(*label)
+    }
    pub fn get_timing_info(&self) -> TimingInfo {
        TimingInfo {
            request_received_ms: self.request_received_epoch_ms,

--- a/lib/llm/src/protocols/openai.rs
+++ b/lib/llm/src/protocols/openai.rs
@@ -232,6 +232,11 @@ pub trait DeltaGeneratorExt<ResponseType: Send + 'static + std::fmt::Debug>:
    /// Get the current usage statistics with properly calculated total_tokens.
    fn get_usage(&self) -> dynamo_async_openai::types::CompletionUsage;
+    /// Returns the request tracker if available, for accessing worker timing metrics.
+    ///
+    /// The default implementation returns `None`; implementors that hold a tracker
+    /// override this method to hand it out.
+    fn tracker(&self) -> Option<std::sync::Arc<common::timing::RequestTracker>> {
+        None
+    }
 }
 #[derive(Clone, Debug, Serialize, Deserialize, Default)]

--- a/lib/llm/src/protocols/openai/chat_completions/delta.rs
+++ b/lib/llm/src/protocols/openai/chat_completions/delta.rs
@@ -153,12 +153,9 @@ impl DeltaGenerator {
        let chatcmpl_id = format!("chatcmpl-{request_id}");
-        // Create request tracker if tracking is enabled
+        // Always create request tracker for per-worker metrics (TTFT, ITL per worker_id).
-        let tracker = if options.enable_tracking {
+        // The enable_tracking option only controls whether timing info is included in the response.
-            Some(Arc::new(RequestTracker::new()))
+        let tracker = Some(Arc::new(RequestTracker::new()));
-        } else {
-            None
-        };
        Self {
            id: chatcmpl_id,
@@ -504,6 +501,10 @@ impl crate::protocols::openai::DeltaGeneratorExt<NvCreateChatCompletionStreamRes
    fn get_usage(&self) -> dynamo_async_openai::types::CompletionUsage {
        DeltaGenerator::get_usage(self)
    }
+    fn tracker(&self) -> Option<std::sync::Arc<crate::protocols::common::timing::RequestTracker>> {
+        // Hand out another handle to the shared tracker; the tracker itself is not copied.
+        self.tracker.as_ref().map(std::sync::Arc::clone)
+    }
 }
 #[cfg(test)]

--- a/lib/llm/src/protocols/openai/completions/delta.rs
+++ b/lib/llm/src/protocols/openai/completions/delta.rs
@@ -120,12 +120,9 @@ impl DeltaGenerator {
        let completion_id = format!("cmpl-{request_id}");
-        // Create request tracker if tracking is enabled
+        // Always create request tracker for per-worker metrics (TTFT, ITL per worker_id).
-        let tracker = if options.enable_tracking {
+        // The enable_tracking option only controls whether timing info is included in the response.
-            Some(Arc::new(RequestTracker::new()))
+        let tracker = Some(Arc::new(RequestTracker::new()));
-        } else {
-            None
-        };
        Self {
            id: completion_id,
@@ -398,4 +395,8 @@ impl crate::protocols::openai::DeltaGeneratorExt<NvCreateCompletionResponse> for
    fn get_usage(&self) -> dynamo_async_openai::types::CompletionUsage {
        DeltaGenerator::get_usage(self)
    }
+    fn tracker(&self) -> Option<std::sync::Arc<crate::protocols::common::timing::RequestTracker>> {
+        // Hand out another handle to the shared tracker; the tracker itself is not copied.
+        self.tracker.as_ref().map(std::sync::Arc::clone)
+    }
 }