Unverified Commit 0c0336e6, authored by Hongkuan Zhou, committed by GitHub
Browse files

feat: Add per-worker Prometheus metrics for router load monitoring (#5842)


Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
parent eff08aed
...@@ -75,9 +75,17 @@ pub struct WorkerIdInfo { ...@@ -75,9 +75,17 @@ pub struct WorkerIdInfo {
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
pub prefill_worker_id: Option<u64>, pub prefill_worker_id: Option<u64>,
/// The prefill worker's data parallel rank
#[serde(skip_serializing_if = "Option::is_none")]
pub prefill_dp_rank: Option<u32>,
/// The decode worker ID that processed this request /// The decode worker ID that processed this request
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
pub decode_worker_id: Option<u64>, pub decode_worker_id: Option<u64>,
/// The decode worker's data parallel rank
#[serde(skip_serializing_if = "Option::is_none")]
pub decode_dp_rank: Option<u32>,
} }
/// NVIDIA LLM response extensions /// NVIDIA LLM response extensions
......
...@@ -150,6 +150,29 @@ pub mod frontend_service { ...@@ -150,6 +150,29 @@ pub mod frontend_service {
/// Total number of request migrations due to worker unavailability /// Total number of request migrations due to worker unavailability
pub const MODEL_MIGRATION_TOTAL: &str = "model_migration_total"; pub const MODEL_MIGRATION_TOTAL: &str = "model_migration_total";
/// Metric name for the active decode blocks (KV cache blocks) of each worker.
///
/// Gauge: reports the current KV cache block utilization per worker.
pub const WORKER_ACTIVE_DECODE_BLOCKS: &str = "worker_active_decode_blocks";
/// Metric name for the active prefill tokens of each worker.
///
/// Gauge: reports the prefill tokens currently queued per worker.
pub const WORKER_ACTIVE_PREFILL_TOKENS: &str = "worker_active_prefill_tokens";
/// Metric name for the last observed time to first token (TTFT) of each worker, in seconds.
///
/// Gauge: holds only the most recent TTFT sample per worker.
pub const WORKER_LAST_TIME_TO_FIRST_TOKEN_SECONDS: &str = "worker_last_time_to_first_token_seconds";
/// Metric name for the last observed input sequence tokens of each worker.
///
/// Gauge: holds the input token count of the same request that produced the value reported
/// under `WORKER_LAST_TIME_TO_FIRST_TOKEN_SECONDS`. Both are updated atomically so that
/// latency can be correlated with input size.
pub const WORKER_LAST_INPUT_SEQUENCE_TOKENS: &str = "worker_last_input_sequence_tokens";
/// Metric name for the last observed inter-token latency (ITL) of each worker, in seconds.
///
/// Gauge: holds only the most recent ITL sample per worker.
pub const WORKER_LAST_INTER_TOKEN_LATENCY_SECONDS: &str = "worker_last_inter_token_latency_seconds";
/// Label name for the type of migration /// Label name for the type of migration
pub const MIGRATION_TYPE_LABEL: &str = "migration_type"; pub const MIGRATION_TYPE_LABEL: &str = "migration_type";
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment