"docs/vscode:/vscode.git/clone" did not exist on "ee33acf3c3c3b656981b7dca914465d1d8f4c811"
Unverified Commit f46720c9 authored by Yan Ru Pei's avatar Yan Ru Pei Committed by GitHub
Browse files

feat: add router-level Prometheus metrics and centralize request tracking (#6146)


Signed-off-by: default avatarPeaBrane <yanrpei@gmail.com>
Co-authored-by: default avatarCursor <cursoragent@cursor.com>
parent 2cee89a0
...@@ -947,6 +947,7 @@ impl KvPushRouter { ...@@ -947,6 +947,7 @@ impl KvPushRouter {
tokio::spawn(async move { tokio::spawn(async move {
let mut stream = stream; let mut stream = stream;
let mut first_item = true; let mut first_item = true;
let mut first_token_gauges_observed = false;
while let Some(mut response) = stream.next().await { while let Some(mut response) = stream.next().await {
// Inject worker_id into first response if tracker is available // Inject worker_id into first response if tracker is available
...@@ -957,6 +958,21 @@ impl KvPushRouter { ...@@ -957,6 +958,21 @@ impl KvPushRouter {
} }
} }
// Observe per-worker TTFT/ISL gauges on first response with actual tokens
if !first_token_gauges_observed {
let has_tokens = response
.data
.as_ref()
.map(|d| !d.token_ids.is_empty())
.unwrap_or(false);
if has_tokens {
if let Some(ref tracker) = tracker {
tracker.observe_first_token_gauges();
}
first_token_gauges_observed = true;
}
}
// Convert LLMEngineOutput to PyObject // Convert LLMEngineOutput to PyObject
let py_response = Python::with_gil(|py| { let py_response = Python::with_gil(|py| {
pythonize(py, &response.data) pythonize(py, &response.data)
...@@ -976,6 +992,11 @@ impl KvPushRouter { ...@@ -976,6 +992,11 @@ impl KvPushRouter {
} }
} }
} }
// Observe per-worker ITL gauge at stream end
if let Some(ref tracker) = tracker {
tracker.observe_finish_gauges();
}
}); });
// Return a Python async generator wrapper // Return a Python async generator wrapper
......
...@@ -12,7 +12,5 @@ pub use watcher::{ModelUpdate, ModelWatcher}; ...@@ -12,7 +12,5 @@ pub use watcher::{ModelUpdate, ModelWatcher};
mod worker_monitor; mod worker_monitor;
pub use worker_monitor::{ pub use worker_monitor::{
KvWorkerMonitor, LoadThresholdConfig, WORKER_ACTIVE_DECODE_BLOCKS_GAUGE, KvWorkerMonitor, LoadThresholdConfig, WORKER_TYPE_DECODE, WORKER_TYPE_PREFILL, WorkerLoadState,
WORKER_ACTIVE_PREFILL_TOKENS_GAUGE, WORKER_TYPE_DECODE, WORKER_TYPE_PREFILL, WorkerLoadState,
register_worker_load_metrics,
}; };
...@@ -3,13 +3,12 @@ ...@@ -3,13 +3,12 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::sync::Arc; use std::sync::Arc;
use std::sync::RwLock;
use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering}; use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering};
use std::sync::{LazyLock, RwLock};
use tokio::sync::Notify; use tokio::sync::Notify;
use dashmap::DashMap; use dashmap::DashMap;
use prometheus::{IntGaugeVec, Opts, Registry};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use crate::http::service::metrics::{ use crate::http::service::metrics::{
...@@ -17,11 +16,11 @@ use crate::http::service::metrics::{ ...@@ -17,11 +16,11 @@ use crate::http::service::metrics::{
WORKER_LAST_TIME_TO_FIRST_TOKEN_GAUGE, WORKER_LAST_TIME_TO_FIRST_TOKEN_GAUGE,
}; };
use crate::kv_router::KV_METRICS_SUBJECT; use crate::kv_router::KV_METRICS_SUBJECT;
use crate::kv_router::metrics::WORKER_LOAD_METRICS;
use crate::kv_router::protocols::ActiveLoad; use crate::kv_router::protocols::ActiveLoad;
use crate::model_card::ModelDeploymentCard; use crate::model_card::ModelDeploymentCard;
use dynamo_runtime::component::Client; use dynamo_runtime::component::Client;
use dynamo_runtime::discovery::{DiscoveryQuery, watch_and_extract_field}; use dynamo_runtime::discovery::{DiscoveryQuery, watch_and_extract_field};
use dynamo_runtime::metrics::prometheus_names::frontend_service;
use dynamo_runtime::pipeline::{WorkerLoadMonitor, async_trait}; use dynamo_runtime::pipeline::{WorkerLoadMonitor, async_trait};
use dynamo_runtime::traits::DistributedRuntimeProvider; use dynamo_runtime::traits::DistributedRuntimeProvider;
use dynamo_runtime::transports::event_plane::EventSubscriber; use dynamo_runtime::transports::event_plane::EventSubscriber;
...@@ -29,62 +28,18 @@ use dynamo_runtime::transports::event_plane::EventSubscriber; ...@@ -29,62 +28,18 @@ use dynamo_runtime::transports::event_plane::EventSubscriber;
// Re-export worker type constants from timing.rs (single source of truth) // Re-export worker type constants from timing.rs (single source of truth)
pub use crate::protocols::common::timing::{WORKER_TYPE_DECODE, WORKER_TYPE_PREFILL}; pub use crate::protocols::common::timing::{WORKER_TYPE_DECODE, WORKER_TYPE_PREFILL};
/// Global Prometheus gauge for active decode blocks per worker (labels: worker_id, dp_rank, worker_type)
/// This is shared across all KvWorkerMonitor instances.
pub static WORKER_ACTIVE_DECODE_BLOCKS_GAUGE: LazyLock<IntGaugeVec> = LazyLock::new(|| {
IntGaugeVec::new(
Opts::new(
format!(
"dynamo_frontend_{}",
frontend_service::WORKER_ACTIVE_DECODE_BLOCKS
),
"Active KV cache decode blocks per worker",
),
&["worker_id", "dp_rank", "worker_type"],
)
.expect("Failed to create worker_active_decode_blocks gauge")
});
/// Global Prometheus gauge for active prefill tokens per worker (labels: worker_id, dp_rank, worker_type)
/// This is shared across all KvWorkerMonitor instances.
pub static WORKER_ACTIVE_PREFILL_TOKENS_GAUGE: LazyLock<IntGaugeVec> = LazyLock::new(|| {
IntGaugeVec::new(
Opts::new(
format!(
"dynamo_frontend_{}",
frontend_service::WORKER_ACTIVE_PREFILL_TOKENS
),
"Active prefill tokens queued per worker",
),
&["worker_id", "dp_rank", "worker_type"],
)
.expect("Failed to create worker_active_prefill_tokens gauge")
});
/// Register the global worker load Prometheus metrics with the given registry.
///
/// This should be called once during HTTP service setup to expose the worker load
/// metrics via the `/metrics` endpoint.
///
/// # Errors
/// Returns an error if the metrics are already registered with the registry.
pub fn register_worker_load_metrics(registry: &Registry) -> Result<(), prometheus::Error> {
registry.register(Box::new(WORKER_ACTIVE_DECODE_BLOCKS_GAUGE.clone()))?;
registry.register(Box::new(WORKER_ACTIVE_PREFILL_TOKENS_GAUGE.clone()))?;
Ok(())
}
/// Clean up all Prometheus metrics for a worker across the specified dp_ranks. /// Clean up all Prometheus metrics for a worker across the specified dp_ranks.
/// ///
/// This removes metrics with the given worker_id, dp_rank, and worker_type label combination. /// This removes metrics with the given worker_id, dp_rank, and worker_type label combination.
/// Called when workers are removed to prevent stale metrics from accumulating. /// Called when workers are removed to prevent stale metrics from accumulating.
fn cleanup_worker_metrics(worker_id: u64, dp_ranks: &[u32], worker_type: &str) { fn cleanup_worker_metrics(worker_id: u64, dp_ranks: &[u32], worker_type: &str) {
let worker_id_str = worker_id.to_string(); let worker_id_str = worker_id.to_string();
let m = &*WORKER_LOAD_METRICS;
for dp_rank in dp_ranks { for dp_rank in dp_ranks {
let dp_rank_str = dp_rank.to_string(); let dp_rank_str = dp_rank.to_string();
let labels = &[worker_id_str.as_str(), dp_rank_str.as_str(), worker_type]; let labels = &[worker_id_str.as_str(), dp_rank_str.as_str(), worker_type];
let _ = WORKER_ACTIVE_DECODE_BLOCKS_GAUGE.remove_label_values(labels); let _ = m.active_decode_blocks.remove_label_values(labels);
let _ = WORKER_ACTIVE_PREFILL_TOKENS_GAUGE.remove_label_values(labels); let _ = m.active_prefill_tokens.remove_label_values(labels);
let _ = WORKER_LAST_TIME_TO_FIRST_TOKEN_GAUGE.remove_label_values(labels); let _ = WORKER_LAST_TIME_TO_FIRST_TOKEN_GAUGE.remove_label_values(labels);
let _ = WORKER_LAST_INPUT_SEQUENCE_TOKENS_GAUGE.remove_label_values(labels); let _ = WORKER_LAST_INPUT_SEQUENCE_TOKENS_GAUGE.remove_label_values(labels);
let _ = WORKER_LAST_INTER_TOKEN_LATENCY_GAUGE.remove_label_values(labels); let _ = WORKER_LAST_INTER_TOKEN_LATENCY_GAUGE.remove_label_values(labels);
...@@ -215,9 +170,9 @@ impl WorkerLoadState { ...@@ -215,9 +170,9 @@ impl WorkerLoadState {
/// Cloning shares state via internal Arc-wrapped fields. This allows multiple pipelines /// Cloning shares state via internal Arc-wrapped fields. This allows multiple pipelines
/// (e.g., chat and completions) to share the same monitor instance. /// (e.g., chat and completions) to share the same monitor instance.
/// ///
/// Prometheus metrics are exposed via the global gauges [`WORKER_ACTIVE_DECODE_BLOCKS_GAUGE`] /// Prometheus metrics are exposed via [`WORKER_LOAD_METRICS`] (defined in `kv_router::sequence`),
/// and [`WORKER_ACTIVE_PREFILL_TOKENS_GAUGE`], which should be registered with the HTTP /// which should be registered with the HTTP service's Prometheus registry using
/// service's Prometheus registry using [`register_worker_load_metrics`]. /// [`register_worker_load_metrics`](crate::kv_router::metrics::register_worker_load_metrics).
/// ///
/// In disaggregated mode, use `set_prefill_client` to register the prefill endpoint for /// In disaggregated mode, use `set_prefill_client` to register the prefill endpoint for
/// proper TTFT metric cleanup when prefill workers are removed. /// proper TTFT metric cleanup when prefill workers are removed.
...@@ -251,8 +206,9 @@ impl KvWorkerMonitor { ...@@ -251,8 +206,9 @@ impl KvWorkerMonitor {
/// - `active_prefill_tokens_threshold`: DEFAULT_MAX_TOKENS (effectively disabled) /// - `active_prefill_tokens_threshold`: DEFAULT_MAX_TOKENS (effectively disabled)
/// - `active_prefill_tokens_threshold_frac`: 1.5 (effectively disabled) /// - `active_prefill_tokens_threshold_frac`: 1.5 (effectively disabled)
/// ///
/// Prometheus metrics are exposed via the global gauges and should be registered /// Prometheus metrics are exposed via [`WORKER_LOAD_METRICS`] and should be registered
/// using [`register_worker_load_metrics`] during HTTP service setup. /// using [`register_worker_load_metrics`](crate::kv_router::metrics::register_worker_load_metrics)
/// during HTTP service setup.
/// ///
/// For disaggregated mode, call `set_prefill_client` after creation to enable /// For disaggregated mode, call `set_prefill_client` after creation to enable
/// proper TTFT metric cleanup when prefill workers are removed. /// proper TTFT metric cleanup when prefill workers are removed.
......
...@@ -111,7 +111,7 @@ pub fn register_worker_timing_metrics(registry: &Registry) -> Result<(), prometh ...@@ -111,7 +111,7 @@ pub fn register_worker_timing_metrics(registry: &Registry) -> Result<(), prometh
/// # Note /// # Note
/// With 2 significant figures, there are roughly 90 unique values per order of magnitude. /// With 2 significant figures, there are roughly 90 unique values per order of magnitude.
/// Requesting more buckets than can be uniquely represented will result in deduplication. /// Requesting more buckets than can be uniquely represented will result in deduplication.
fn generate_log_buckets(min: f64, max: f64, count: usize) -> Vec<f64> { pub fn generate_log_buckets(min: f64, max: f64, count: usize) -> Vec<f64> {
if count == 0 { if count == 0 {
return vec![]; return vec![];
} }
...@@ -153,7 +153,7 @@ fn generate_log_buckets(min: f64, max: f64, count: usize) -> Vec<f64> { ...@@ -153,7 +153,7 @@ fn generate_log_buckets(min: f64, max: f64, count: usize) -> Vec<f64> {
} }
/// Round a number to a specified number of significant figures /// Round a number to a specified number of significant figures
fn round_to_sig_figs(value: f64, sig_figs: u32) -> f64 { pub fn round_to_sig_figs(value: f64, sig_figs: u32) -> f64 {
if value == 0.0 { if value == 0.0 {
return 0.0; return 0.0;
} }
......
...@@ -16,8 +16,9 @@ use super::Metrics; ...@@ -16,8 +16,9 @@ use super::Metrics;
use super::RouteDoc; use super::RouteDoc;
use super::metrics; use super::metrics;
use super::metrics::register_worker_timing_metrics; use super::metrics::register_worker_timing_metrics;
use crate::discovery::{ModelManager, register_worker_load_metrics}; use crate::discovery::ModelManager;
use crate::endpoint_type::EndpointType; use crate::endpoint_type::EndpointType;
use crate::kv_router::metrics::{register_routing_overhead_metrics, register_worker_load_metrics};
use crate::request_template::RequestTemplate; use crate::request_template::RequestTemplate;
use anyhow::Result; use anyhow::Result;
use axum_server::tls_rustls::RustlsConfig; use axum_server::tls_rustls::RustlsConfig;
...@@ -391,6 +392,12 @@ impl HttpServiceConfigBuilder { ...@@ -391,6 +392,12 @@ impl HttpServiceConfigBuilder {
tracing::warn!("Failed to register worker timing metrics: {}", e); tracing::warn!("Failed to register worker timing metrics: {}", e);
} }
// Register routing overhead metrics (block hashing, find matches, scheduling latencies)
// These are updated by KvRouter::find_best_match on every routing decision
if let Err(e) = register_routing_overhead_metrics(&registry) {
tracing::warn!("Failed to register routing overhead metrics: {}", e);
}
let mut router = axum::Router::new(); let mut router = axum::Router::new();
let mut all_docs = Vec::new(); let mut all_docs = Vec::new();
......
...@@ -2,9 +2,7 @@ ...@@ -2,9 +2,7 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
use std::collections::HashMap; use std::collections::HashMap;
use std::time::Duration; use std::time::{Duration, Instant};
#[cfg(feature = "bench")]
use std::time::Instant;
use anyhow::Result; use anyhow::Result;
use dynamo_runtime::{ use dynamo_runtime::{
...@@ -27,6 +25,7 @@ pub use dynamo_kv_router::indexer; ...@@ -27,6 +25,7 @@ pub use dynamo_kv_router::indexer;
pub use dynamo_kv_router::protocols; pub use dynamo_kv_router::protocols;
pub mod config; pub mod config;
pub mod metrics;
pub mod prefill_router; pub mod prefill_router;
pub mod publisher; pub mod publisher;
pub mod push_router; pub mod push_router;
...@@ -63,7 +62,6 @@ pub const KV_METRICS_ENDPOINT: &str = "load_metrics"; ...@@ -63,7 +62,6 @@ pub const KV_METRICS_ENDPOINT: &str = "load_metrics";
// for metric publishing (push-based) // for metric publishing (push-based)
pub const KV_EVENT_SUBJECT: &str = "kv-events"; pub const KV_EVENT_SUBJECT: &str = "kv-events";
pub const KV_HIT_RATE_SUBJECT: &str = "kv-hit-rate";
pub const KV_METRICS_SUBJECT: &str = "kv_metrics"; pub const KV_METRICS_SUBJECT: &str = "kv_metrics";
// for inter-router comms // for inter-router comms
...@@ -316,7 +314,6 @@ impl KvRouter { ...@@ -316,7 +314,6 @@ impl KvRouter {
update_states: bool, update_states: bool,
lora_name: Option<String>, lora_name: Option<String>,
) -> anyhow::Result<(WorkerWithDpRank, u32)> { ) -> anyhow::Result<(WorkerWithDpRank, u32)> {
#[cfg(feature = "bench")]
let start = Instant::now(); let start = Instant::now();
if update_states && context_id.is_none() { if update_states && context_id.is_none() {
...@@ -326,16 +323,16 @@ impl KvRouter { ...@@ -326,16 +323,16 @@ impl KvRouter {
let isl_tokens = tokens.len(); let isl_tokens = tokens.len();
let block_hashes = compute_block_hash_for_seq(tokens, self.block_size, None); let block_hashes = compute_block_hash_for_seq(tokens, self.block_size, None);
#[cfg(feature = "bench")]
let hash_elapsed = start.elapsed(); let hash_elapsed = start.elapsed();
let overlap_scores = self.indexer.find_matches(block_hashes).await?; let overlap_scores = self.indexer.find_matches(block_hashes).await?;
#[cfg(feature = "bench")]
let find_matches_elapsed = start.elapsed(); let find_matches_elapsed = start.elapsed();
// Compute seq_hashes only if scheduler needs it for active blocks tracking // Compute seq_hashes only if scheduler needs it for active blocks tracking
let maybe_seq_hashes = self let maybe_seq_hashes = self
.kv_router_config .kv_router_config
.compute_seq_hashes_for_tracking(tokens, self.block_size); .compute_seq_hashes_for_tracking(tokens, self.block_size);
let seq_hash_elapsed = start.elapsed();
let best_worker = self let best_worker = self
.scheduler .scheduler
...@@ -349,19 +346,25 @@ impl KvRouter { ...@@ -349,19 +346,25 @@ impl KvRouter {
lora_name, lora_name,
) )
.await?; .await?;
let total_elapsed = start.elapsed();
metrics::ROUTING_OVERHEAD_METRICS.observe(
hash_elapsed,
find_matches_elapsed,
seq_hash_elapsed,
total_elapsed,
);
#[cfg(feature = "bench")] #[cfg(feature = "bench")]
{ tracing::info!(
let total_elapsed = start.elapsed(); isl_tokens,
tracing::info!( hash_us = hash_elapsed.as_micros() as u64,
isl_tokens, find_matches_us = (find_matches_elapsed - hash_elapsed).as_micros() as u64,
hash_us = hash_elapsed.as_micros() as u64, seq_hash_us = (seq_hash_elapsed - find_matches_elapsed).as_micros() as u64,
find_matches_us = (find_matches_elapsed - hash_elapsed).as_micros() as u64, schedule_us = (total_elapsed - seq_hash_elapsed).as_micros() as u64,
schedule_us = (total_elapsed - find_matches_elapsed).as_micros() as u64, total_us = total_elapsed.as_micros() as u64,
total_us = total_elapsed.as_micros() as u64, "find_best_match completed"
"find_best_match completed" );
);
}
// Note: Routing decision recording (for approximate mode) is now handled // Note: Routing decision recording (for approximate mode) is now handled
// by KvPushRouter::generate after select_worker returns. // by KvPushRouter::generate after select_worker returns.
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Prometheus metrics for the KV router.
//!
//! This module centralizes all router-side Prometheus metric definitions:
//! - [`WorkerLoadMetrics`]: Per-worker active decode blocks and prefill tokens gauges.
//! - [`RoutingOverheadMetrics`]: Per-request routing phase latency histograms.
use std::sync::{Arc, LazyLock, OnceLock};
use std::time::Duration;
use dynamo_runtime::component::Component;
use dynamo_runtime::metrics::MetricsHierarchy;
use dynamo_runtime::metrics::prometheus_names::{
frontend_service, labels, name_prefix, routing_overhead,
};
use prometheus::{IntGaugeVec, Opts};
use crate::http::service::metrics::generate_log_buckets;
// ---------------------------------------------------------------------------
// Worker load metrics (gauges)
// ---------------------------------------------------------------------------
/// Per-worker active load gauges, published by `ActiveSequencesMultiWorker`
/// and cleaned up by `KvWorkerMonitor` when workers disappear.
pub struct WorkerLoadMetrics {
pub active_decode_blocks: IntGaugeVec,
pub active_prefill_tokens: IntGaugeVec,
}
impl WorkerLoadMetrics {
pub fn observe(
&self,
worker_id: u64,
dp_rank: u32,
worker_type: &str,
active_blocks: usize,
active_tokens: usize,
) {
let worker_id_str = worker_id.to_string();
let dp_rank_str = dp_rank.to_string();
let labels = &[worker_id_str.as_str(), dp_rank_str.as_str(), worker_type];
self.active_decode_blocks
.with_label_values(labels)
.set(active_blocks as i64);
self.active_prefill_tokens
.with_label_values(labels)
.set(active_tokens as i64);
}
}
pub static WORKER_LOAD_METRICS: LazyLock<WorkerLoadMetrics> = LazyLock::new(|| WorkerLoadMetrics {
active_decode_blocks: IntGaugeVec::new(
Opts::new(
format!(
"{}_{}",
name_prefix::FRONTEND,
frontend_service::WORKER_ACTIVE_DECODE_BLOCKS
),
"Active KV cache decode blocks per worker",
),
&[labels::WORKER_ID, labels::DP_RANK, labels::WORKER_TYPE],
)
.expect("Failed to create worker_active_decode_blocks gauge"),
active_prefill_tokens: IntGaugeVec::new(
Opts::new(
format!(
"{}_{}",
name_prefix::FRONTEND,
frontend_service::WORKER_ACTIVE_PREFILL_TOKENS
),
"Active prefill tokens queued per worker",
),
&[labels::WORKER_ID, labels::DP_RANK, labels::WORKER_TYPE],
)
.expect("Failed to create worker_active_prefill_tokens gauge"),
});
/// Register the worker load gauges with the given Prometheus registry.
pub fn register_worker_load_metrics(
registry: &prometheus::Registry,
) -> Result<(), prometheus::Error> {
let m = &*WORKER_LOAD_METRICS;
registry.register(Box::new(m.active_decode_blocks.clone()))?;
registry.register(Box::new(m.active_prefill_tokens.clone()))?;
Ok(())
}
// ---------------------------------------------------------------------------
// Routing overhead metrics (histograms)
// ---------------------------------------------------------------------------
/// Per-request routing phase latency histograms (milliseconds).
pub struct RoutingOverheadMetrics {
pub block_hashing: prometheus::Histogram,
pub indexer_find_matches: prometheus::Histogram,
pub seq_hashing: prometheus::Histogram,
pub scheduling: prometheus::Histogram,
pub total: prometheus::Histogram,
}
pub static ROUTING_OVERHEAD_METRICS: LazyLock<RoutingOverheadMetrics> = LazyLock::new(|| {
// Buckets from 0.0001ms (0.1μs) to ~10ms, exponential with factor 2
let buckets = prometheus::exponential_buckets(0.0001, 2.0, 18)
.expect("exponential buckets should not fail");
let make = |suffix: &str, help: &str| {
let name = format!("{}_{}", name_prefix::ROUTING_OVERHEAD, suffix);
prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(name, help).buckets(buckets.clone()),
)
.expect("histogram creation should not fail")
};
RoutingOverheadMetrics {
block_hashing: make(
routing_overhead::BLOCK_HASHING_MS,
"Time spent computing block hashes in milliseconds",
),
indexer_find_matches: make(
routing_overhead::INDEXER_FIND_MATCHES_MS,
"Time spent in indexer find_matches in milliseconds",
),
seq_hashing: make(
routing_overhead::SEQ_HASHING_MS,
"Time spent computing sequence hashes in milliseconds",
),
scheduling: make(
routing_overhead::SCHEDULING_MS,
"Time spent in scheduler worker selection in milliseconds",
),
total: make(
routing_overhead::TOTAL_MS,
"Total routing overhead per request in milliseconds",
),
}
});
impl RoutingOverheadMetrics {
/// Observe routing overhead timings in milliseconds.
pub fn observe(
&self,
hash_elapsed: Duration,
find_matches_elapsed: Duration,
seq_hash_elapsed: Duration,
total_elapsed: Duration,
) {
self.block_hashing
.observe(hash_elapsed.as_secs_f64() * 1000.0);
self.indexer_find_matches.observe(
find_matches_elapsed
.saturating_sub(hash_elapsed)
.as_secs_f64()
* 1000.0,
);
self.seq_hashing.observe(
seq_hash_elapsed
.saturating_sub(find_matches_elapsed)
.as_secs_f64()
* 1000.0,
);
self.scheduling
.observe(total_elapsed.saturating_sub(seq_hash_elapsed).as_secs_f64() * 1000.0);
self.total.observe(total_elapsed.as_secs_f64() * 1000.0);
}
}
/// Register the routing overhead histograms with the given Prometheus registry.
pub fn register_routing_overhead_metrics(
registry: &prometheus::Registry,
) -> Result<(), prometheus::Error> {
let m = &*ROUTING_OVERHEAD_METRICS;
registry.register(Box::new(m.block_hashing.clone()))?;
registry.register(Box::new(m.indexer_find_matches.clone()))?;
registry.register(Box::new(m.seq_hashing.clone()))?;
registry.register(Box::new(m.scheduling.clone()))?;
registry.register(Box::new(m.total.clone()))?;
Ok(())
}
// ---------------------------------------------------------------------------
// Router request metrics (component-scoped aggregate histograms + counter)
// ---------------------------------------------------------------------------
/// Aggregate per-request metrics observed at the router level.
/// Component-scoped via `from_component()` to get automatic namespace/component labels.
/// Uses `router_` prefix to distinguish from `ResponseMetricCollector` metrics.
pub struct RouterRequestMetrics {
pub requests_total: prometheus::IntCounter,
pub time_to_first_token_seconds: prometheus::Histogram,
pub inter_token_latency_seconds: prometheus::Histogram,
pub input_sequence_tokens: prometheus::Histogram,
pub output_sequence_tokens: prometheus::Histogram,
}
static ROUTER_REQUEST_METRICS: OnceLock<Arc<RouterRequestMetrics>> = OnceLock::new();
impl RouterRequestMetrics {
fn new(
requests_total: prometheus::IntCounter,
time_to_first_token_seconds: prometheus::Histogram,
inter_token_latency_seconds: prometheus::Histogram,
input_sequence_tokens: prometheus::Histogram,
output_sequence_tokens: prometheus::Histogram,
) -> Self {
Self {
requests_total,
time_to_first_token_seconds,
inter_token_latency_seconds,
input_sequence_tokens,
output_sequence_tokens,
}
}
/// Create from a Component, memoized in a static OnceLock.
pub fn from_component(component: &Component) -> Arc<Self> {
ROUTER_REQUEST_METRICS
.get_or_init(|| {
let metrics = component.metrics();
let requests_total = metrics
.create_intcounter(
"router_requests_total",
"Total number of requests processed by the router",
&[],
)
.expect("failed to create router_requests_total");
let time_to_first_token_seconds = metrics
.create_histogram(
"router_time_to_first_token_seconds",
"Time to first token observed at the router",
&[],
Some(generate_log_buckets(0.001, 480.0, 18)),
)
.expect("failed to create router_time_to_first_token_seconds");
let inter_token_latency_seconds = metrics
.create_histogram(
"router_inter_token_latency_seconds",
"Average inter-token latency observed at the router",
&[],
Some(generate_log_buckets(0.001, 2.0, 13)),
)
.expect("failed to create router_inter_token_latency_seconds");
let input_sequence_tokens = metrics
.create_histogram(
"router_input_sequence_tokens",
"Input sequence length in tokens observed at the router",
&[],
Some(generate_log_buckets(50.0, 128000.0, 12)),
)
.expect("failed to create router_input_sequence_tokens");
let output_sequence_tokens = metrics
.create_histogram(
"router_output_sequence_tokens",
"Output sequence length in tokens observed at the router",
&[],
Some(generate_log_buckets(50.0, 32000.0, 10)),
)
.expect("failed to create router_output_sequence_tokens");
Arc::new(Self::new(
requests_total,
time_to_first_token_seconds,
inter_token_latency_seconds,
input_sequence_tokens,
output_sequence_tokens,
))
})
.clone()
}
}
#[cfg(test)]
mod tests {
use super::*;
use prometheus::{Encoder, TextEncoder};
fn gather_pef(registry: &prometheus::Registry) -> String {
let encoder = TextEncoder::new();
let mut buffer = Vec::new();
encoder.encode(&registry.gather(), &mut buffer).unwrap();
String::from_utf8(buffer).unwrap()
}
#[test]
fn test_worker_load_metrics_pef() {
let registry = prometheus::Registry::new();
let metrics = WorkerLoadMetrics {
active_decode_blocks: IntGaugeVec::new(
Opts::new(
format!(
"{}_{}",
name_prefix::FRONTEND,
frontend_service::WORKER_ACTIVE_DECODE_BLOCKS
),
"Active KV cache decode blocks per worker",
),
&[labels::WORKER_ID, labels::DP_RANK, labels::WORKER_TYPE],
)
.unwrap(),
active_prefill_tokens: IntGaugeVec::new(
Opts::new(
format!(
"{}_{}",
name_prefix::FRONTEND,
frontend_service::WORKER_ACTIVE_PREFILL_TOKENS
),
"Active prefill tokens queued per worker",
),
&[labels::WORKER_ID, labels::DP_RANK, labels::WORKER_TYPE],
)
.unwrap(),
};
registry
.register(Box::new(metrics.active_decode_blocks.clone()))
.unwrap();
registry
.register(Box::new(metrics.active_prefill_tokens.clone()))
.unwrap();
metrics.observe(123, 0, "decode", 42, 100);
let output = gather_pef(&registry);
let expected = "\
# HELP dynamo_frontend_worker_active_decode_blocks Active KV cache decode blocks per worker
# TYPE dynamo_frontend_worker_active_decode_blocks gauge
dynamo_frontend_worker_active_decode_blocks{dp_rank=\"0\",worker_id=\"123\",worker_type=\"decode\"} 42
# HELP dynamo_frontend_worker_active_prefill_tokens Active prefill tokens queued per worker
# TYPE dynamo_frontend_worker_active_prefill_tokens gauge
dynamo_frontend_worker_active_prefill_tokens{dp_rank=\"0\",worker_id=\"123\",worker_type=\"decode\"} 100
";
assert_eq!(
output, expected,
"\nActual PEF:\n{output}\nExpected PEF:\n{expected}"
);
}
#[test]
fn test_routing_overhead_metric_names_pef() {
let registry = prometheus::Registry::new();
let buckets = prometheus::exponential_buckets(0.0001, 2.0, 18).unwrap();
let make = |suffix: &str, help: &str| {
let name = format!("{}_{}", name_prefix::ROUTING_OVERHEAD, suffix);
prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(name, help).buckets(buckets.clone()),
)
.unwrap()
};
let total = make(
routing_overhead::TOTAL_MS,
"Total routing overhead per request in milliseconds",
);
registry.register(Box::new(total.clone())).unwrap();
total.observe(1.5);
let output = gather_pef(&registry);
assert!(
output.contains("# HELP dynamo_routing_overhead_total_ms"),
"PEF missing HELP for routing overhead metric"
);
assert!(
output.contains("# TYPE dynamo_routing_overhead_total_ms histogram"),
"PEF missing TYPE for routing overhead metric"
);
assert!(
output.contains("dynamo_routing_overhead_total_ms_count 1"),
"PEF missing observation count"
);
}
#[test]
fn test_routing_overhead_saturating_sub() {
let buckets = prometheus::exponential_buckets(0.0001, 2.0, 18).unwrap();
let make = |name: &str| {
prometheus::Histogram::with_opts(
prometheus::HistogramOpts::new(name, "test").buckets(buckets.clone()),
)
.unwrap()
};
let metrics = RoutingOverheadMetrics {
block_hashing: make("test_block_hashing_ms"),
indexer_find_matches: make("test_find_matches_ms"),
seq_hashing: make("test_seq_hashing_ms"),
scheduling: make("test_scheduling_ms"),
total: make("test_total_ms"),
};
// Out-of-order durations: each phase < previous (would panic without saturating_sub)
metrics.observe(
Duration::from_millis(10),
Duration::from_millis(5),
Duration::from_millis(3),
Duration::from_millis(1),
);
// Reaching here without panic confirms saturating_sub works
}
}
...@@ -348,7 +348,7 @@ impl PrefillRouter { ...@@ -348,7 +348,7 @@ impl PrefillRouter {
/// ///
/// If `phase_permit` is provided, it is dropped after the first output is received, /// If `phase_permit` is provided, it is dropped after the first output is received,
/// allowing subsequent `set_phase` calls to proceed. This is used in the bootstrap /// allowing subsequent `set_phase` calls to proceed. This is used in the bootstrap
/// optimization path to ensure `record_worker` completes before the phase changes. /// optimization path to ensure `record_worker_full` completes before the phase changes.
/// ///
/// Returns (PrefillResult, Option<(worker_id, dp_rank)>). /// Returns (PrefillResult, Option<(worker_id, dp_rank)>).
async fn execute_prefill( async fn execute_prefill(
...@@ -363,7 +363,7 @@ impl PrefillRouter { ...@@ -363,7 +363,7 @@ impl PrefillRouter {
.await .await
.map_err(|e| PrefillError::PrefillError(e.to_string()))?; .map_err(|e| PrefillError::PrefillError(e.to_string()))?;
// Drop phase permit now - routing is complete, record_worker was called in select_worker. // Drop phase permit now - routing is complete, record_worker_full was called in select_worker.
// This unblocks set_phase(Decode) in the main task without waiting for prefill output. // This unblocks set_phase(Decode) in the main task without waiting for prefill output.
drop(phase_permit); drop(phase_permit);
...@@ -530,7 +530,6 @@ impl ...@@ -530,7 +530,6 @@ impl
} }
let tracker = req.tracker.as_ref().unwrap(); let tracker = req.tracker.as_ref().unwrap();
let prefill_phase_permit = tracker.set_phase(RequestPhase::Prefill).await; let prefill_phase_permit = tracker.set_phase(RequestPhase::Prefill).await;
tracker.record_prefill_start();
// Prepare prefill request with max_tokens = 1 (clone after tracker is set) // Prepare prefill request with max_tokens = 1 (clone after tracker is set)
let mut prefill_req = req.clone(); let mut prefill_req = req.clone();
...@@ -557,14 +556,6 @@ impl ...@@ -557,14 +556,6 @@ impl
router.select_next_worker(); router.select_next_worker();
} }
// Record prefill worker on the main request's tracker for metrics.
// (The cloned prefill_req has its own tracker, so we need to record here)
// Worker type is stored at routing time to avoid expensive MDC lookups when
// updating Prometheus TTFT metrics later in the response stream.
if let Some(ref tracker) = req.tracker {
tracker.record_prefill_worker_full(worker_id, dp_rank, WORKER_TYPE_PREFILL);
}
let routing = prefill_req.routing_mut(); let routing = prefill_req.routing_mut();
routing.prefill_worker_id = Some(worker_id); routing.prefill_worker_id = Some(worker_id);
routing.dp_rank = Some(dp_rank); routing.dp_rank = Some(dp_rank);
...@@ -573,7 +564,7 @@ impl ...@@ -573,7 +564,7 @@ impl
let prefill_context = Context::with_id(prefill_req, request_id.clone()); let prefill_context = Context::with_id(prefill_req, request_id.clone());
engine_ctx.link_child(prefill_context.context()); engine_ctx.link_child(prefill_context.context());
// Pass phase permit to spawned task - it drops after first output (record_worker complete) // Pass phase permit to spawned task - it drops after first output (record_worker_full complete)
// This allows set_phase(Decode) below to proceed only after prefill routing is done // This allows set_phase(Decode) below to proceed only after prefill routing is done
self.spawn_prefill_task(prefill_context, Some(worker_id), prefill_phase_permit); self.spawn_prefill_task(prefill_context, Some(worker_id), prefill_phase_permit);
...@@ -591,16 +582,6 @@ impl ...@@ -591,16 +582,6 @@ impl
let result = self.call_prefill(prefill_context).await; let result = self.call_prefill(prefill_context).await;
// Record prefill worker on the main request's tracker for metrics.
// (call_prefill returns the worker_id and dp_rank from the prefill routing)
// Worker type is stored at routing time to avoid expensive MDC lookups when
// updating Prometheus TTFT metrics later in the response stream.
if let Ok((_, Some((worker_id, dp_rank)))) = &result
&& let Some(ref tracker) = req.tracker
{
tracker.record_prefill_worker_full(*worker_id, *dp_rank, WORKER_TYPE_PREFILL);
}
result.map(|(result, worker_info)| { result.map(|(result, worker_info)| {
(Some(result), worker_info.map(|(id, _)| id), None) (Some(result), worker_info.map(|(id, _)| id), None)
}) })
...@@ -625,7 +606,7 @@ impl ...@@ -625,7 +606,7 @@ impl
// Set phase to Decode for the decode request. // Set phase to Decode for the decode request.
// In bootstrap path, this blocks until the spawned prefill task drops its permit // In bootstrap path, this blocks until the spawned prefill task drops its permit
// (after first output / record_worker completes), ensuring correct phase for routing. // (after first output / record_worker_full completes), ensuring correct phase for routing.
if let Some(ref tracker) = req.tracker { if let Some(ref tracker) = req.tracker {
let _decode_permit = tracker.set_phase(RequestPhase::Decode).await; let _decode_permit = tracker.set_phase(RequestPhase::Decode).await;
// Permit is dropped immediately - decode proceeds, no need to hold it // Permit is dropped immediately - decode proceeds, no need to hold it
......
...@@ -17,6 +17,7 @@ use serde_json::json; ...@@ -17,6 +17,7 @@ use serde_json::json;
use crate::{ use crate::{
kv_router::{ kv_router::{
KvRouter, KvRouter,
metrics::RouterRequestMetrics,
protocols::{TokensWithHashes, WorkerWithDpRank}, protocols::{TokensWithHashes, WorkerWithDpRank},
}, },
preprocessor::PreprocessedRequest, preprocessor::PreprocessedRequest,
...@@ -219,20 +220,25 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu ...@@ -219,20 +220,25 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
} }
} }
// Record metrics in tracker: KV hit rate, worker ID, and worker type based on phase. // Record routing metrics on tracker and observe ISL + prefill start.
// Worker type is stored at routing time to avoid expensive MDC lookups when let request_metrics =
// updating Prometheus metrics (TTFT/ITL) later in the response stream. RouterRequestMetrics::from_component(self.chooser.client().endpoint.component());
if let Some(ref tracker) = request.tracker { if let Some(ref tracker) = request.tracker {
let isl_blocks = request.token_ids.len().div_ceil(block_size); let isl_blocks = request.token_ids.len().div_ceil(block_size);
tracker.record_kv_hit(overlap_amount, isl_blocks); tracker.record_kv_hit(overlap_amount, isl_blocks);
tracker.record_isl(
request.token_ids.len(),
overlap_amount as usize * block_size,
);
tracker.record_worker_full(instance_id, dp_rank, self.chooser.worker_type()); tracker.record_worker_full(instance_id, dp_rank, self.chooser.worker_type());
} }
request_metrics
.input_sequence_tokens
.observe(request.token_ids.len() as f64);
// Handle query-only requests: early return with worker info // Handle query-only requests: early return with worker info
if is_query_only { if is_query_only {
let stream_context = request.context().clone(); let stream_context = request.context().clone();
// Tracker is always created for query-only requests (delta generator enables tracking
// when query_instance_id annotation is present)
let worker_id_info = request.tracker.as_ref().and_then(|t| t.get_worker_info()); let worker_id_info = request.tracker.as_ref().and_then(|t| t.get_worker_info());
tracing::trace!( tracing::trace!(
...@@ -262,11 +268,17 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu ...@@ -262,11 +268,17 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
.and_then(|r| r.expected_output_tokens); .and_then(|r| r.expected_output_tokens);
let track_output_blocks = let track_output_blocks =
self.chooser.kv_router_config().router_track_output_blocks && handle_local_updates; self.chooser.kv_router_config().router_track_output_blocks && handle_local_updates;
let tracker = request.tracker.clone();
let (mut backend_input, context) = request.into_parts(); let (mut backend_input, context) = request.into_parts();
backend_input.routing_mut().dp_rank = Some(dp_rank); backend_input.routing_mut().dp_rank = Some(dp_rank);
let updated_request = context.map(|_| backend_input); let updated_request = context.map(|_| backend_input);
// Record prefill start right before pushing to backend (OnceLock: first call wins).
if let Some(ref tracker) = tracker {
tracker.record_prefill_start();
}
let chooser = self.chooser.clone(); let chooser = self.chooser.clone();
let mut response_stream = self.inner.direct(updated_request, instance_id).await?; let mut response_stream = self.inner.direct(updated_request, instance_id).await?;
let stream_context = response_stream.context(); let stream_context = response_stream.context();
...@@ -277,6 +289,7 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu ...@@ -277,6 +289,7 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
// When false, an external caller (e.g., GAIE sidecar) handles bookkeeping via C FFI. // When false, an external caller (e.g., GAIE sidecar) handles bookkeeping via C FFI.
let wrapped_stream = Box::pin(async_stream::stream! { let wrapped_stream = Box::pin(async_stream::stream! {
let mut prefill_marked = false; let mut prefill_marked = false;
let mut first_token_recorded = false;
// Output block tracking state // Output block tracking state
let mut cumulative_osl: usize = 0; let mut cumulative_osl: usize = 0;
...@@ -310,13 +323,27 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu ...@@ -310,13 +323,27 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
} }
} }
let new_tokens = item.data.as_ref()
.map(|d| d.token_ids.len())
.unwrap_or(0);
// Record first token time on tracker when actual tokens arrive
if !first_token_recorded && new_tokens > 0 {
if let Some(ref tracker) = tracker {
tracker.record_first_token();
if let Some(ttft) = tracker.ttft_ms() {
request_metrics
.time_to_first_token_seconds
.observe(ttft / 1000.0);
}
}
first_token_recorded = true;
}
cumulative_osl += new_tokens;
// Track output blocks if enabled // Track output blocks if enabled
if track_output_blocks { if track_output_blocks {
let new_tokens = item.data.as_ref()
.map(|d| d.token_ids.len())
.unwrap_or(0);
cumulative_osl += new_tokens;
let new_total_blocks = (isl_tokens + cumulative_osl).div_ceil(block_size); let new_total_blocks = (isl_tokens + cumulative_osl).div_ceil(block_size);
if new_total_blocks > current_total_blocks { if new_total_blocks > current_total_blocks {
// New block boundary crossed - add output block with decay // New block boundary crossed - add output block with decay
...@@ -329,6 +356,18 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu ...@@ -329,6 +356,18 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
"Failed to add output block for request {context_id}: {e}" "Failed to add output block for request {context_id}: {e}"
); );
} }
// Update tracker and observe avg ITL at each block boundary
if let Some(ref tracker) = tracker {
tracker.record_osl(cumulative_osl);
tracker.record_finish();
if let Some(avg_itl) = tracker.avg_itl_ms() {
request_metrics
.inter_token_latency_seconds
.observe(avg_itl / 1000.0);
}
}
current_total_blocks = new_total_blocks; current_total_blocks = new_total_blocks;
} }
} }
...@@ -338,6 +377,17 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu ...@@ -338,6 +377,17 @@ impl AsyncEngine<SingleIn<PreprocessedRequest>, ManyOut<Annotated<LLMEngineOutpu
} }
} }
// Record final aggregate metrics (histograms sampled once per request)
if let Some(ref tracker) = tracker {
tracker.record_finish();
tracker.record_osl(cumulative_osl);
request_metrics
.output_sequence_tokens
.observe(cumulative_osl as f64);
}
request_metrics.requests_total.inc();
// Only call free() if we handle local updates. // Only call free() if we handle local updates.
// When handle_local_updates=false, external caller handles cleanup via C FFI. // When handle_local_updates=false, external caller handles cleanup via C FFI.
if handle_local_updates if handle_local_updates
......
...@@ -6,7 +6,6 @@ use crate::local_model::runtime_config::ModelRuntimeConfig; ...@@ -6,7 +6,6 @@ use crate::local_model::runtime_config::ModelRuntimeConfig;
use anyhow::Result; use anyhow::Result;
use dynamo_runtime::component::Component; use dynamo_runtime::component::Component;
use dynamo_runtime::traits::DistributedRuntimeProvider; use dynamo_runtime::traits::DistributedRuntimeProvider;
use dynamo_runtime::transports::event_plane::EventPublisher;
use rand::Rng; use rand::Rng;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet}; use std::collections::{HashMap, HashSet};
...@@ -15,7 +14,6 @@ use std::time::Duration; ...@@ -15,7 +14,6 @@ use std::time::Duration;
#[cfg(feature = "bench")] #[cfg(feature = "bench")]
use std::time::Instant; use std::time::Instant;
use super::KV_HIT_RATE_SUBJECT;
use super::KvRouterConfig; use super::KvRouterConfig;
use super::RouterConfigOverride; use super::RouterConfigOverride;
use super::WorkerSelector; use super::WorkerSelector;
...@@ -24,15 +22,6 @@ use super::sequence::{ActiveSequencesMultiWorker, SequenceError}; ...@@ -24,15 +22,6 @@ use super::sequence::{ActiveSequencesMultiWorker, SequenceError};
use dynamo_tokens::SequenceHash; use dynamo_tokens::SequenceHash;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KVHitRateEvent {
pub worker_id: WorkerId,
#[serde(default)]
pub dp_rank: DpRank,
pub isl_blocks: usize,
pub overlap_blocks: u32,
}
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PotentialLoad { pub struct PotentialLoad {
pub worker_id: WorkerId, pub worker_id: WorkerId,
...@@ -160,10 +149,6 @@ impl KvScheduler { ...@@ -160,10 +149,6 @@ impl KvScheduler {
let scheduler_rx = workers_with_configs.clone(); let scheduler_rx = workers_with_configs.clone();
let (request_tx, request_rx) = tokio::sync::mpsc::channel::<SchedulingRequest>(1024); let (request_tx, request_rx) = tokio::sync::mpsc::channel::<SchedulingRequest>(1024);
let scheduler_cancel_token = component.drt().primary_token(); let scheduler_cancel_token = component.drt().primary_token();
let hit_rate_publisher =
EventPublisher::for_namespace(component.namespace(), KV_HIT_RATE_SUBJECT)
.await
.map_err(|e| KvSchedulerError::InitFailed(e.to_string()))?;
// Background task to handle scheduling requests // Background task to handle scheduling requests
tokio::spawn(async move { tokio::spawn(async move {
...@@ -199,16 +184,6 @@ impl KvScheduler { ...@@ -199,16 +184,6 @@ impl KvScheduler {
match selector.select_worker(&workers, &request, block_size) { match selector.select_worker(&workers, &request, block_size) {
Ok(selection) => { Ok(selection) => {
let event = KVHitRateEvent {
worker_id: selection.worker.worker_id,
dp_rank: selection.worker.dp_rank,
isl_blocks: selection.required_blocks as usize,
overlap_blocks: selection.overlap_blocks,
};
if let Err(e) = hit_rate_publisher.publish(&event).await {
tracing::warn!("Failed to publish KV hit rate event: {:?}", e);
}
let response = SchedulingResponse { let response = SchedulingResponse {
best_worker: selection.worker, best_worker: selection.worker,
overlap_blocks: selection.overlap_blocks, overlap_blocks: selection.overlap_blocks,
......
...@@ -37,10 +37,10 @@ use std::time::Duration; ...@@ -37,10 +37,10 @@ use std::time::Duration;
use tokio::time::Instant; use tokio::time::Instant;
use uuid::Uuid; use uuid::Uuid;
use super::metrics::WORKER_LOAD_METRICS;
use super::protocols::{ use super::protocols::{
ActiveLoad, ActiveSequenceEvent, ActiveSequenceEventData, WorkerWithDpRank, ActiveLoad, ActiveSequenceEvent, ActiveSequenceEventData, WorkerWithDpRank,
}; };
use crate::discovery::{WORKER_ACTIVE_DECODE_BLOCKS_GAUGE, WORKER_ACTIVE_PREFILL_TOKENS_GAUGE};
use crate::kv_router::{ACTIVE_SEQUENCES_SUBJECT, KV_METRICS_SUBJECT}; use crate::kv_router::{ACTIVE_SEQUENCES_SUBJECT, KV_METRICS_SUBJECT};
use crate::local_model::runtime_config::ModelRuntimeConfig; use crate::local_model::runtime_config::ModelRuntimeConfig;
use dynamo_runtime::CancellationToken; use dynamo_runtime::CancellationToken;
...@@ -1043,22 +1043,13 @@ impl ActiveSequencesMultiWorker { ...@@ -1043,22 +1043,13 @@ impl ActiveSequencesMultiWorker {
}; };
// Update Prometheus gauges directly (router's own bookkeeping) // Update Prometheus gauges directly (router's own bookkeeping)
let worker_id_str = worker.worker_id.to_string(); WORKER_LOAD_METRICS.observe(
let dp_rank_str = worker.dp_rank.to_string(); worker.worker_id,
WORKER_ACTIVE_DECODE_BLOCKS_GAUGE worker.dp_rank,
.with_label_values(&[ self.worker_type,
worker_id_str.as_str(), active_blocks,
dp_rank_str.as_str(), active_tokens,
self.worker_type, );
])
.set(active_blocks as i64);
WORKER_ACTIVE_PREFILL_TOKENS_GAUGE
.with_label_values(&[
worker_id_str.as_str(),
dp_rank_str.as_str(),
self.worker_type,
])
.set(active_tokens as i64);
// Also publish ActiveLoad to NATS for other subscribers (if NATS is available) // Also publish ActiveLoad to NATS for other subscribers (if NATS is available)
let active_load = ActiveLoad { let active_load = ActiveLoad {
......
...@@ -6,20 +6,20 @@ ...@@ -6,20 +6,20 @@
//! This module provides [`RequestTracker`] for tracking timing and routing information //! This module provides [`RequestTracker`] for tracking timing and routing information
//! that can be returned to clients via the `nvext` response field. //! that can be returned to clients via the `nvext` response field.
use std::time::{Instant, SystemTime, UNIX_EPOCH}; use std::sync::Arc;
use std::{ use std::sync::OnceLock;
sync::{ use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
Arc, OnceLock, use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
atomic::{AtomicU32, AtomicU64, Ordering},
},
time::Duration,
};
use parking_lot::Mutex; use parking_lot::Mutex;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tokio::sync::{OwnedSemaphorePermit, Semaphore};
use utoipa::ToSchema; use utoipa::ToSchema;
use crate::http::service::metrics::{
WORKER_LAST_INPUT_SEQUENCE_TOKENS_GAUGE, WORKER_LAST_INTER_TOKEN_LATENCY_GAUGE,
WORKER_LAST_TIME_TO_FIRST_TOKEN_GAUGE,
};
use crate::protocols::openai::nvext::WorkerIdInfo; use crate::protocols::openai::nvext::WorkerIdInfo;
/// Sentinel value indicating no worker ID has been set. /// Sentinel value indicating no worker ID has been set.
...@@ -62,18 +62,27 @@ impl std::fmt::Display for RequestPhase { ...@@ -62,18 +62,27 @@ impl std::fmt::Display for RequestPhase {
/// Captures information throughout the request lifecycle: /// Captures information throughout the request lifecycle:
/// - `request_received`: When the request was received /// - `request_received`: When the request was received
/// - `prefill_start_time`: When prefill started (for disaggregated serving) /// - `prefill_start_time`: When prefill started (for disaggregated serving)
/// - `first_token_time`: When the first token was generated (set once via OnceLock) /// - `first_token_time`: When the first token was generated
/// - `request_finish_time`: When the request finished (set once via OnceLock) /// - `request_finish_time`: When the last token was generated (updated incrementally)
/// - KV cache hit rate information /// - KV cache hit rate information
/// - Worker IDs and types for per-worker Prometheus metrics /// - Worker IDs and types for per-worker Prometheus metrics
/// ///
/// The `OnceLock` fields ensure that values are set exactly once, /// ## Concurrency primitives
/// which is important for disaggregated serving where the "first token" ///
/// might appear multiple times. /// **`OnceLock` (first-write-wins):** Used for values that must capture the earliest
/// observation and ignore later writes. In disaggregated serving, both prefill and decode
/// phases may call `record_first_token`; `OnceLock` ensures the prefill phase's TTFT is
/// preserved. Also used for one-shot metadata: `prefill_start_time`, KV hit info,
/// ISL/cached tokens, worker types, and tokenizer latency.
/// ///
/// Worker IDs use `AtomicU64` instead of `OnceLock<u64>` for lower overhead since /// **`Mutex` (last-write-wins):** Used for values where later phases should overwrite
/// the tracker is created for every request. The sentinel value `NO_WORKER_ID` (0) /// earlier ones. `request_finish_time` is updated incrementally at each output block
/// indicates no worker has been recorded yet. /// boundary so that `avg_itl_ms()` stays current during streaming, and the decode
/// phase's final finish naturally overwrites the prefill phase's earlier finish.
/// `phase` also uses a Mutex since it transitions across phases.
///
/// **`AtomicU64`/`AtomicU32`:** Used for frequently updated counters (`osl_tokens`)
/// and worker IDs/ranks where `OnceLock`'s heap overhead is unnecessary.
#[derive(Debug)] #[derive(Debug)]
pub struct RequestTracker { pub struct RequestTracker {
/// When the request was received (monotonic clock for duration calculations) /// When the request was received (monotonic clock for duration calculations)
...@@ -85,11 +94,14 @@ pub struct RequestTracker { ...@@ -85,11 +94,14 @@ pub struct RequestTracker {
/// When prefill started (for disaggregated serving) - set once via OnceLock /// When prefill started (for disaggregated serving) - set once via OnceLock
prefill_start_time: OnceLock<Instant>, prefill_start_time: OnceLock<Instant>,
/// When the first token was generated - set once via OnceLock /// When the first token was generated (set once via OnceLock).
/// In disaggregated serving, the prefill phase records this first and the
/// decode phase's attempt is silently ignored, preserving the real TTFT.
first_token_time: OnceLock<Instant>, first_token_time: OnceLock<Instant>,
/// When the request finished - set once via OnceLock /// When the request finished. Mutex allows the last router phase to
request_finish_time: OnceLock<Instant>, /// record the final finish time.
request_finish_time: Mutex<Option<Instant>>,
/// KV cache overlap blocks (prefix cache hits) - set once via OnceLock /// KV cache overlap blocks (prefix cache hits) - set once via OnceLock
kv_overlap_blocks: OnceLock<u32>, kv_overlap_blocks: OnceLock<u32>,
...@@ -97,6 +109,15 @@ pub struct RequestTracker { ...@@ -97,6 +109,15 @@ pub struct RequestTracker {
/// Input sequence length in blocks (for hit rate calculation) - set once via OnceLock /// Input sequence length in blocks (for hit rate calculation) - set once via OnceLock
isl_blocks: OnceLock<usize>, isl_blocks: OnceLock<usize>,
/// Input sequence length in tokens - set once via OnceLock
isl_tokens: OnceLock<usize>,
/// Number of cached tokens (overlap_blocks * block_size) - set once via OnceLock
cached_tokens: OnceLock<usize>,
/// Output sequence length in tokens - updated atomically as tokens stream back
osl_tokens: AtomicU64,
/// Prefill worker ID (for disaggregated serving). /// Prefill worker ID (for disaggregated serving).
/// Uses atomic with compare-exchange for set-once semantics. /// Uses atomic with compare-exchange for set-once semantics.
/// Value of 0 (NO_WORKER_ID) means not yet set. /// Value of 0 (NO_WORKER_ID) means not yet set.
...@@ -128,7 +149,7 @@ pub struct RequestTracker { ...@@ -128,7 +149,7 @@ pub struct RequestTracker {
/// Semaphore for coordinating phase transitions. /// Semaphore for coordinating phase transitions.
/// Acquiring a permit blocks subsequent set_phase calls until the permit is dropped. /// Acquiring a permit blocks subsequent set_phase calls until the permit is dropped.
/// This prevents race conditions in the bootstrap optimization path where prefill /// This prevents race conditions in the bootstrap optimization path where prefill
/// runs in background and needs to complete record_worker before phase changes. /// runs in background and needs to complete record_worker_full before phase changes.
phase_semaphore: Arc<Semaphore>, phase_semaphore: Arc<Semaphore>,
/// How long it took to tokenize the input /// How long it took to tokenize the input
...@@ -149,9 +170,12 @@ impl RequestTracker { ...@@ -149,9 +170,12 @@ impl RequestTracker {
request_received_epoch_ms: epoch_ms, request_received_epoch_ms: epoch_ms,
prefill_start_time: OnceLock::new(), prefill_start_time: OnceLock::new(),
first_token_time: OnceLock::new(), first_token_time: OnceLock::new(),
request_finish_time: OnceLock::new(), request_finish_time: Mutex::new(None),
kv_overlap_blocks: OnceLock::new(), kv_overlap_blocks: OnceLock::new(),
isl_blocks: OnceLock::new(), isl_blocks: OnceLock::new(),
isl_tokens: OnceLock::new(),
cached_tokens: OnceLock::new(),
osl_tokens: AtomicU64::new(0),
prefill_worker_id: AtomicU64::new(NO_WORKER_ID), prefill_worker_id: AtomicU64::new(NO_WORKER_ID),
prefill_dp_rank: AtomicU32::new(NO_DP_RANK), prefill_dp_rank: AtomicU32::new(NO_DP_RANK),
decode_worker_id: AtomicU64::new(NO_WORKER_ID), decode_worker_id: AtomicU64::new(NO_WORKER_ID),
...@@ -169,12 +193,12 @@ impl RequestTracker { ...@@ -169,12 +193,12 @@ impl RequestTracker {
self.prefill_start_time.set(Instant::now()).is_ok() self.prefill_start_time.set(Instant::now()).is_ok()
} }
pub fn record_first_token(&self) -> bool { pub fn record_first_token(&self) {
self.first_token_time.set(Instant::now()).is_ok() let _ = self.first_token_time.set(Instant::now());
} }
pub fn record_finish(&self) -> bool { pub fn record_finish(&self) {
self.request_finish_time.set(Instant::now()).is_ok() *self.request_finish_time.lock() = Some(Instant::now());
} }
/// Record KV cache hit information. Returns true if this was the first call. /// Record KV cache hit information. Returns true if this was the first call.
...@@ -184,6 +208,29 @@ impl RequestTracker { ...@@ -184,6 +208,29 @@ impl RequestTracker {
overlap_set && isl_set overlap_set && isl_set
} }
/// Record input sequence length in tokens and cached token count.
pub fn record_isl(&self, isl_tokens: usize, cached_tokens: usize) {
let _ = self.isl_tokens.set(isl_tokens);
let _ = self.cached_tokens.set(cached_tokens);
}
pub fn isl_tokens(&self) -> Option<usize> {
self.isl_tokens.get().copied()
}
pub fn cached_tokens(&self) -> Option<usize> {
self.cached_tokens.get().copied()
}
/// Record current output sequence length in tokens. Updated at each output block boundary.
pub fn record_osl(&self, osl: usize) {
self.osl_tokens.store(osl as u64, Ordering::Relaxed);
}
pub fn osl_tokens(&self) -> u64 {
self.osl_tokens.load(Ordering::Relaxed)
}
/// Time from request received to prefill start (queue/wait time) in milliseconds. /// Time from request received to prefill start (queue/wait time) in milliseconds.
pub fn prefill_wait_time_ms(&self) -> Option<f64> { pub fn prefill_wait_time_ms(&self) -> Option<f64> {
self.prefill_start_time self.prefill_start_time
...@@ -199,15 +246,32 @@ impl RequestTracker { ...@@ -199,15 +246,32 @@ impl RequestTracker {
} }
pub fn ttft_ms(&self) -> Option<f64> { pub fn ttft_ms(&self) -> Option<f64> {
self.first_token_time let first_token = self.first_token_time.get()?;
.get() Some(
.map(|t| t.duration_since(self.request_received).as_secs_f64() * 1000.0) first_token
.duration_since(self.request_received)
.as_secs_f64()
* 1000.0,
)
} }
pub fn total_time_ms(&self) -> Option<f64> { pub fn total_time_ms(&self) -> Option<f64> {
self.request_finish_time let finish = (*self.request_finish_time.lock())?;
.get() Some(finish.duration_since(self.request_received).as_secs_f64() * 1000.0)
.map(|t| t.duration_since(self.request_received).as_secs_f64() * 1000.0) }
/// Average inter-token latency in milliseconds.
/// Computed as (finish_time - first_token_time) / (osl - 1).
/// Returns None if fewer than 2 output tokens or times not recorded.
pub fn avg_itl_ms(&self) -> Option<f64> {
let first_token = *self.first_token_time.get()?;
let finish = (*self.request_finish_time.lock())?;
let osl = self.osl_tokens.load(Ordering::Relaxed);
if osl < 2 {
return None;
}
let decode_duration = finish.duration_since(first_token).as_secs_f64() * 1000.0;
Some(decode_duration / (osl - 1) as f64)
} }
pub fn request_received_epoch_ms(&self) -> u64 { pub fn request_received_epoch_ms(&self) -> u64 {
...@@ -224,95 +288,11 @@ impl RequestTracker { ...@@ -224,95 +288,11 @@ impl RequestTracker {
Some(overlap as f64 / isl as f64) Some(overlap as f64 / isl as f64)
} }
/// Record the prefill worker ID. Returns true if this was the first call.
pub fn record_prefill_worker(&self, id: u64) -> bool {
self.prefill_worker_id
.compare_exchange(NO_WORKER_ID, id, Ordering::SeqCst, Ordering::SeqCst)
.is_ok()
}
/// Record the prefill worker ID and DP rank. Returns true if worker_id was recorded for the first time.
/// Only sets the dp_rank if the worker_id is newly set to avoid mismatched worker_id/dp_rank pairs.
pub fn record_prefill_worker_with_rank(&self, id: u64, dp_rank: u32) -> bool {
let is_new = self
.prefill_worker_id
.compare_exchange(NO_WORKER_ID, id, Ordering::SeqCst, Ordering::SeqCst)
.is_ok();
if is_new {
self.prefill_dp_rank.store(dp_rank, Ordering::SeqCst);
}
is_new
}
/// Record the prefill worker ID, DP rank, and worker type.
/// The worker_type is stored to avoid MDC lookup when updating Prometheus metrics.
/// Returns true if worker_id was recorded for the first time.
pub fn record_prefill_worker_full(
&self,
id: u64,
dp_rank: u32,
worker_type: &'static str,
) -> bool {
let is_new = self
.prefill_worker_id
.compare_exchange(NO_WORKER_ID, id, Ordering::SeqCst, Ordering::SeqCst)
.is_ok();
if is_new {
self.prefill_dp_rank.store(dp_rank, Ordering::SeqCst);
let _ = self.prefill_worker_type.set(worker_type);
}
is_new
}
/// Record the decode worker ID. Returns true if this was the first call.
pub fn record_decode_worker(&self, id: u64) -> bool {
self.decode_worker_id
.compare_exchange(NO_WORKER_ID, id, Ordering::SeqCst, Ordering::SeqCst)
.is_ok()
}
/// Record the decode worker ID and DP rank. Returns true if worker_id was recorded for the first time.
/// Only sets the dp_rank if the worker_id is newly set to avoid mismatched worker_id/dp_rank pairs.
pub fn record_decode_worker_with_rank(&self, id: u64, dp_rank: u32) -> bool {
let is_new = self
.decode_worker_id
.compare_exchange(NO_WORKER_ID, id, Ordering::SeqCst, Ordering::SeqCst)
.is_ok();
if is_new {
self.decode_dp_rank.store(dp_rank, Ordering::SeqCst);
}
is_new
}
/// Record the decode worker ID, DP rank, and worker type.
/// The worker_type is stored to avoid MDC lookup when updating Prometheus metrics.
/// Returns true if worker_id was recorded for the first time.
pub fn record_decode_worker_full(
&self,
id: u64,
dp_rank: u32,
worker_type: &'static str,
) -> bool {
let is_new = self
.decode_worker_id
.compare_exchange(NO_WORKER_ID, id, Ordering::SeqCst, Ordering::SeqCst)
.is_ok();
if is_new {
self.decode_dp_rank.store(dp_rank, Ordering::SeqCst);
let _ = self.decode_worker_type.set(worker_type);
}
is_new
}
/// Set the request phase and return a permit that blocks subsequent phase changes. /// Set the request phase and return a permit that blocks subsequent phase changes.
/// ///
/// The returned permit must be dropped to allow the next `set_phase` call to proceed. /// The returned permit must be dropped to allow the next `set_phase` call to proceed.
/// Under normal operation, callers can simply ignore the returned permit (letting it /// In the bootstrap optimization path, the permit is held and passed to the spawned
/// drop immediately). In the bootstrap optimization path, the permit is held and /// prefill task, ensuring routing completes before the phase changes.
/// passed to the spawned prefill task, which drops it after `record_worker` completes.
///
/// This prevents the race condition where the phase changes to Decode before the
/// background prefill task has recorded its worker ID.
pub async fn set_phase(&self, phase: RequestPhase) -> OwnedSemaphorePermit { pub async fn set_phase(&self, phase: RequestPhase) -> OwnedSemaphorePermit {
let permit = self let permit = self
.phase_semaphore .phase_semaphore
...@@ -329,68 +309,31 @@ impl RequestTracker { ...@@ -329,68 +309,31 @@ impl RequestTracker {
*self.phase.lock() *self.phase.lock()
} }
/// Record worker ID based on the current phase.
///
/// - Prefill phase: records as prefill_worker_id
/// - Decode phase: records as decode_worker_id
/// - Aggregated phase: records as both prefill and decode worker
pub fn record_worker(&self, instance_id: u64) {
match self.phase() {
RequestPhase::Prefill => {
self.record_prefill_worker(instance_id);
}
RequestPhase::Decode => {
self.record_decode_worker(instance_id);
}
RequestPhase::Aggregated => {
self.record_prefill_worker(instance_id);
self.record_decode_worker(instance_id);
}
}
}
/// Record worker ID and DP rank based on the current phase.
///
/// - Prefill phase: records as prefill_worker_id/prefill_dp_rank
/// - Decode phase: records as decode_worker_id/decode_dp_rank
/// - Aggregated phase: records as both prefill and decode worker/rank
pub fn record_worker_with_rank(&self, instance_id: u64, dp_rank: u32) {
match self.phase() {
RequestPhase::Prefill => {
self.record_prefill_worker_with_rank(instance_id, dp_rank);
}
RequestPhase::Decode => {
self.record_decode_worker_with_rank(instance_id, dp_rank);
}
RequestPhase::Aggregated => {
self.record_prefill_worker_with_rank(instance_id, dp_rank);
self.record_decode_worker_with_rank(instance_id, dp_rank);
}
}
}
/// Record worker ID, DP rank, and worker type based on the current phase. /// Record worker ID, DP rank, and worker type based on the current phase.
/// ///
/// This is the preferred method when worker_type is known (from MDC or router config), /// Each slot is written exactly once by `KvPushRouter::generate()`:
/// as it stores the worker_type for later use in Prometheus metric updates without /// - Prefill phase: stores as prefill worker
/// requiring an expensive MDC lookup. /// - Decode phase: stores as decode worker
/// /// - Aggregated phase: stores as both prefill and decode worker
/// - Prefill phase: records as prefill worker with given worker_type
/// - Decode phase: records as decode worker with given worker_type
/// - Aggregated phase: records as both prefill and decode worker with the same worker_type
pub fn record_worker_full(&self, instance_id: u64, dp_rank: u32, worker_type: &'static str) { pub fn record_worker_full(&self, instance_id: u64, dp_rank: u32, worker_type: &'static str) {
match self.phase() { match self.phase() {
RequestPhase::Prefill => { RequestPhase::Prefill => {
self.record_prefill_worker_full(instance_id, dp_rank, worker_type); self.prefill_worker_id.store(instance_id, Ordering::Relaxed);
self.prefill_dp_rank.store(dp_rank, Ordering::Relaxed);
let _ = self.prefill_worker_type.set(worker_type);
} }
RequestPhase::Decode => { RequestPhase::Decode => {
self.record_decode_worker_full(instance_id, dp_rank, worker_type); self.decode_worker_id.store(instance_id, Ordering::Relaxed);
self.decode_dp_rank.store(dp_rank, Ordering::Relaxed);
let _ = self.decode_worker_type.set(worker_type);
} }
RequestPhase::Aggregated => { RequestPhase::Aggregated => {
// In aggregated mode, both prefill and decode happen on the same worker, self.prefill_worker_id.store(instance_id, Ordering::Relaxed);
// so we record the same worker_type for both self.prefill_dp_rank.store(dp_rank, Ordering::Relaxed);
self.record_prefill_worker_full(instance_id, dp_rank, worker_type); let _ = self.prefill_worker_type.set(worker_type);
self.record_decode_worker_full(instance_id, dp_rank, worker_type); self.decode_worker_id.store(instance_id, Ordering::Relaxed);
self.decode_dp_rank.store(dp_rank, Ordering::Relaxed);
let _ = self.decode_worker_type.set(worker_type);
} }
} }
} }
...@@ -454,6 +397,51 @@ impl RequestTracker { ...@@ -454,6 +397,51 @@ impl RequestTracker {
self.decode_worker_type.get().copied() self.decode_worker_type.get().copied()
} }
/// Write TTFT and ISL to per-worker last gauges using prefill worker labels.
/// Called from the Python binding path on first token.
pub fn observe_first_token_gauges(&self) {
let Some(worker_id) = self.prefill_worker_id() else {
return;
};
let worker_id_str = worker_id.to_string();
let dp_rank_str = self
.prefill_dp_rank()
.map_or("0".to_string(), |r| r.to_string());
let worker_type = self.prefill_worker_type().unwrap_or(WORKER_TYPE_PREFILL);
let labels = &[worker_id_str.as_str(), dp_rank_str.as_str(), worker_type];
if let Some(ttft) = self.ttft_ms() {
WORKER_LAST_TIME_TO_FIRST_TOKEN_GAUGE
.with_label_values(labels)
.set(ttft / 1000.0);
}
if let Some(isl) = self.isl_tokens() {
WORKER_LAST_INPUT_SEQUENCE_TOKENS_GAUGE
.with_label_values(labels)
.set(isl as i64);
}
}
/// Write avg ITL to per-worker last gauge using decode worker labels.
/// Called at each output block boundary and from the Python binding path.
pub fn observe_finish_gauges(&self) {
let Some(worker_id) = self.decode_worker_id() else {
return;
};
let worker_id_str = worker_id.to_string();
let dp_rank_str = self
.decode_dp_rank()
.map_or("0".to_string(), |r| r.to_string());
let worker_type = self.decode_worker_type().unwrap_or(WORKER_TYPE_DECODE);
let labels = &[worker_id_str.as_str(), dp_rank_str.as_str(), worker_type];
if let Some(avg_itl) = self.avg_itl_ms() {
WORKER_LAST_INTER_TOKEN_LATENCY_GAUGE
.with_label_values(labels)
.set(avg_itl / 1000.0);
}
}
pub fn get_timing_info(&self) -> TimingInfo { pub fn get_timing_info(&self) -> TimingInfo {
TimingInfo { TimingInfo {
request_received_ms: self.request_received_epoch_ms, request_received_ms: self.request_received_epoch_ms,
...@@ -501,3 +489,163 @@ pub struct TimingInfo { ...@@ -501,3 +489,163 @@ pub struct TimingInfo {
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
pub kv_hit_rate: Option<f64>, pub kv_hit_rate: Option<f64>,
} }
#[cfg(test)]
mod tests {
use super::*;
use std::thread;
use std::time::Duration;
#[test]
fn test_record_isl_osl() {
let tracker = RequestTracker::new();
tracker.record_isl(512, 256);
assert_eq!(tracker.isl_tokens(), Some(512));
assert_eq!(tracker.cached_tokens(), Some(256));
tracker.record_osl(100);
assert_eq!(tracker.osl_tokens(), 100);
}
#[test]
fn test_ttft_ms() {
let tracker = RequestTracker::new();
thread::sleep(Duration::from_millis(10));
tracker.record_first_token();
let ttft = tracker.ttft_ms().unwrap();
assert!(ttft >= 5.0, "TTFT should be at least 5ms, got {ttft}");
}
#[test]
fn test_ttft_ms_none_before_first_token() {
let tracker = RequestTracker::new();
assert!(tracker.ttft_ms().is_none());
}
#[test]
fn test_avg_itl_ms() {
let tracker = RequestTracker::new();
tracker.record_first_token();
thread::sleep(Duration::from_millis(20));
tracker.record_osl(11); // 11 tokens => 10 inter-token gaps
tracker.record_finish();
let itl = tracker.avg_itl_ms().unwrap();
assert!(itl > 0.0, "avg ITL should be positive, got {itl}");
}
#[test]
fn test_avg_itl_ms_none_with_single_token() {
let tracker = RequestTracker::new();
tracker.record_first_token();
tracker.record_osl(1);
tracker.record_finish();
assert!(
tracker.avg_itl_ms().is_none(),
"avg ITL should be None with < 2 output tokens"
);
}
#[test]
fn test_kv_hit_rate() {
let tracker = RequestTracker::new();
tracker.record_kv_hit(3, 10);
let rate = tracker.kv_hit_rate().unwrap();
assert!(
(rate - 0.3).abs() < f64::EPSILON,
"KV hit rate should be 0.3, got {rate}"
);
}
#[test]
fn test_kv_hit_rate_zero_isl() {
let tracker = RequestTracker::new();
tracker.record_kv_hit(0, 0);
assert!(
tracker.kv_hit_rate().is_none(),
"KV hit rate should be None when isl_blocks is 0"
);
}
#[test]
fn test_total_time_ms() {
let tracker = RequestTracker::new();
thread::sleep(Duration::from_millis(10));
tracker.record_finish();
let total = tracker.total_time_ms().unwrap();
assert!(
total >= 5.0,
"total time should be at least 5ms, got {total}"
);
}
#[test]
fn test_observe_first_token_gauges_no_panic_without_worker() {
let tracker = RequestTracker::new();
tracker.record_first_token();
tracker.record_isl(100, 50);
// No worker recorded — should return early without panic
tracker.observe_first_token_gauges();
}
#[test]
fn test_observe_finish_gauges_no_panic_without_worker() {
let tracker = RequestTracker::new();
tracker.record_first_token();
tracker.record_osl(10);
tracker.record_finish();
// No worker recorded — should return early without panic
tracker.observe_finish_gauges();
}
#[test]
fn test_observe_first_token_gauges_with_worker() {
let tracker = RequestTracker::new();
tracker.record_worker_full(42, 0, WORKER_TYPE_PREFILL);
thread::sleep(Duration::from_millis(5));
tracker.record_first_token();
tracker.record_isl(256, 128);
tracker.observe_first_token_gauges();
let labels = &["42", "0", WORKER_TYPE_PREFILL];
let ttft_val = WORKER_LAST_TIME_TO_FIRST_TOKEN_GAUGE
.with_label_values(labels)
.get();
assert!(
ttft_val > 0.0,
"TTFT gauge should be positive after observe, got {ttft_val}"
);
let isl_val = WORKER_LAST_INPUT_SEQUENCE_TOKENS_GAUGE
.with_label_values(labels)
.get();
assert_eq!(isl_val, 256, "ISL gauge should be 256, got {isl_val}");
}
#[test]
fn test_observe_finish_gauges_with_worker() {
let tracker = RequestTracker::new();
tracker.record_worker_full(99, 1, WORKER_TYPE_DECODE);
tracker.record_first_token();
thread::sleep(Duration::from_millis(10));
tracker.record_osl(5);
tracker.record_finish();
tracker.observe_finish_gauges();
let labels = &["99", "1", WORKER_TYPE_DECODE];
let itl_val = WORKER_LAST_INTER_TOKEN_LATENCY_GAUGE
.with_label_values(labels)
.get();
assert!(
itl_val > 0.0,
"ITL gauge should be positive after observe, got {itl_val}"
);
}
}
...@@ -429,11 +429,6 @@ impl crate::protocols::openai::DeltaGeneratorExt<NvCreateChatCompletionStreamRes ...@@ -429,11 +429,6 @@ impl crate::protocols::openai::DeltaGeneratorExt<NvCreateChatCompletionStreamRes
delta.stop_reason, delta.stop_reason,
); );
// Record first token time (only succeeds on first call due to OnceLock)
if let Some(ref tracker) = self.tracker {
tracker.record_first_token();
}
// Get worker_id info from tracker (set by KvPushRouter based on phase) // Get worker_id info from tracker (set by KvPushRouter based on phase)
let worker_id_info = self.tracker.as_ref().and_then(|t| t.get_worker_info()); let worker_id_info = self.tracker.as_ref().and_then(|t| t.get_worker_info());
......
...@@ -323,11 +323,6 @@ impl crate::protocols::openai::DeltaGeneratorExt<NvCreateCompletionResponse> for ...@@ -323,11 +323,6 @@ impl crate::protocols::openai::DeltaGeneratorExt<NvCreateCompletionResponse> for
let index = delta.index.unwrap_or(0); let index = delta.index.unwrap_or(0);
let mut response = self.create_choice(index, delta.text.clone(), finish_reason, logprobs); let mut response = self.create_choice(index, delta.text.clone(), finish_reason, logprobs);
// Record first token time (only succeeds on first call due to OnceLock)
if let Some(ref tracker) = self.tracker {
tracker.record_first_token();
}
// Get worker_id info from tracker (set by KvPushRouter based on phase) // Get worker_id info from tracker (set by KvPushRouter based on phase)
let worker_id_info = self.tracker.as_ref().and_then(|t| t.get_worker_info()); let worker_id_info = self.tracker.as_ref().and_then(|t| t.get_worker_info());
......
...@@ -68,6 +68,9 @@ pub mod name_prefix { ...@@ -68,6 +68,9 @@ pub mod name_prefix {
/// Prefix for frontend service metrics /// Prefix for frontend service metrics
pub const FRONTEND: &str = "dynamo_frontend"; pub const FRONTEND: &str = "dynamo_frontend";
/// Prefix for routing overhead metrics (raw Prometheus, not component-scoped)
pub const ROUTING_OVERHEAD: &str = "dynamo_routing_overhead";
} }
/// Automatically inserted Prometheus label names used across the metrics system /// Automatically inserted Prometheus label names used across the metrics system
...@@ -93,6 +96,9 @@ pub mod labels { ...@@ -93,6 +96,9 @@ pub mod labels {
/// It is used by worker/load-style metrics that need to disambiguate per-worker series. /// It is used by worker/load-style metrics that need to disambiguate per-worker series.
pub const DP_RANK: &str = "dp_rank"; pub const DP_RANK: &str = "dp_rank";
/// Label for worker instance ID (etcd lease ID).
pub const WORKER_ID: &str = "worker_id";
/// Label for model name/path (OpenAI API standard, injected by Dynamo) /// Label for model name/path (OpenAI API standard, injected by Dynamo)
/// This is the standard label name injected by all backends in metrics_labels=[("model", ...)]. /// This is the standard label name injected by all backends in metrics_labels=[("model", ...)].
/// Ensures compatibility with OpenAI-compatible tooling. /// Ensures compatibility with OpenAI-compatible tooling.
...@@ -367,6 +373,27 @@ pub mod kvbm { ...@@ -367,6 +373,27 @@ pub mod kvbm {
pub const OBJECT_WRITE_FAILURES: &str = "object_write_failures"; pub const OBJECT_WRITE_FAILURES: &str = "object_write_failures";
} }
/// Routing overhead phase latency histogram names (raw Prometheus, not component-scoped).
///
/// These are combined with [`name_prefix::ROUTING_OVERHEAD`] to form full metric names,
/// e.g. `dynamo_routing_overhead_block_hashing_ms`.
pub mod routing_overhead {
/// Time spent computing block hashes
pub const BLOCK_HASHING_MS: &str = "block_hashing_ms";
/// Time spent in indexer find_matches
pub const INDEXER_FIND_MATCHES_MS: &str = "indexer_find_matches_ms";
/// Time spent computing sequence hashes
pub const SEQ_HASHING_MS: &str = "seq_hashing_ms";
/// Time spent in scheduler worker selection
pub const SCHEDULING_MS: &str = "scheduling_ms";
/// Total routing overhead per request
pub const TOTAL_MS: &str = "total_ms";
}
// KvRouter (including KvInexer) Prometheus metric names // KvRouter (including KvInexer) Prometheus metric names
pub mod kvrouter { pub mod kvrouter {
/// Number of KV cache events applied to the index (including status) /// Number of KV cache events applied to the index (including status)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment