feat: Add per-worker Prometheus metrics for router load monitoring (#5842)

Signed-off-by: hongkuanz <hongkuanz@nvidia.com>

feat: Add per-worker Prometheus metrics for router load monitoring (#5842)
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
0c0336e6 · Hongkuan Zhou · GitHub · eff08aed · 0c0336e6 · 0c0336e6
Unverified Commit 0c0336e6 authored Feb 04, 2026 by Hongkuan Zhou Committed by GitHub Feb 04, 2026
Show whitespace changes
Inline Side-by-side

Showing with 31 additions and 0 deletions

lib/llm/src/protocols/openai/nvext.rs lib/llm/src/protocols/openai/nvext.rs +8 -0

lib/runtime/src/metrics/prometheus_names.rs lib/runtime/src/metrics/prometheus_names.rs +23 -0

No files found.
--- a/lib/llm/src/protocols/openai/nvext.rs
+++ b/lib/llm/src/protocols/openai/nvext.rs
@@ -75,9 +75,17 @@ pub struct WorkerIdInfo {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prefill_worker_id: Option<u64>,

+    /// The prefill worker's data parallel rank
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub prefill_dp_rank: Option<u32>,
+
    /// The decode worker ID that processed this request
    #[serde(skip_serializing_if = "Option::is_none")]
    pub decode_worker_id: Option<u64>,
+
+    /// The decode worker's data parallel rank
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub decode_dp_rank: Option<u32>,
 }

 /// NVIDIA LLM response extensions

--- a/lib/runtime/src/metrics/prometheus_names.rs
+++ b/lib/runtime/src/metrics/prometheus_names.rs
@@ -150,6 +150,29 @@ pub mod frontend_service {
    /// Total number of request migrations due to worker unavailability
    pub const MODEL_MIGRATION_TOTAL: &str = "model_migration_total";

+    /// Active decode blocks (KV cache blocks) per worker.
+    /// Gauge metric tracking current KV cache block utilization for each worker.
+    pub const WORKER_ACTIVE_DECODE_BLOCKS: &str = "worker_active_decode_blocks";
+
+    /// Active prefill tokens per worker.
+    /// Gauge metric tracking current queued prefill tokens for each worker.
+    pub const WORKER_ACTIVE_PREFILL_TOKENS: &str = "worker_active_prefill_tokens";
+
+    /// Last observed time to first token (TTFT) per worker, in seconds.
+    /// Gauge metric tracking the most recent TTFT for each worker.
+    pub const WORKER_LAST_TIME_TO_FIRST_TOKEN_SECONDS: &str =
+        "worker_last_time_to_first_token_seconds";
+
+    /// Last observed input sequence tokens per worker.
+    /// Gauge metric tracking the input token count from the same request as `WORKER_LAST_TIME_TO_FIRST_TOKEN_SECONDS`.
+    /// Updated atomically with TTFT to correlate latency with input size.
+    pub const WORKER_LAST_INPUT_SEQUENCE_TOKENS: &str = "worker_last_input_sequence_tokens";
+
+    /// Last observed inter-token latency (ITL) per worker, in seconds.
+    /// Gauge metric tracking the most recent ITL for each worker.
+    pub const WORKER_LAST_INTER_TOKEN_LATENCY_SECONDS: &str =
+        "worker_last_inter_token_latency_seconds";
+
    /// Label name for the type of migration
    pub const MIGRATION_TYPE_LABEL: &str = "migration_type";