"launch/vscode:/vscode.git/clone" did not exist on "41ff394f511f09c00240874fd46c352d64c04bd2"
Unverified commit bba26d55, authored by Thomas Montfort, committed by GitHub
Browse files

fix(metrics): widen histogram buckets for request duration and router overhead (#8087)


Signed-off-by: tmontfort <tmontfort@nvidia.com>
Signed-off-by: Thomas Montfort <tmontfort@nvidia.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent 243d7f07
......@@ -521,7 +521,7 @@ impl Metrics {
// Request duration buckets: configurable via DYN_METRICS_REQUEST_DURATION_{MIN,MAX,COUNT}
let (req_dur_min, req_dur_max, req_dur_count) =
parse_bucket_config("DYN_METRICS_REQUEST_DURATION", 1.0, 256.0, 10);
parse_bucket_config("DYN_METRICS_REQUEST_DURATION", 1.0, 512.0, 10);
let request_duration_buckets =
generate_log_buckets(req_dur_min, req_dur_max, req_dur_count);
......
......@@ -56,10 +56,14 @@ use prometheus::{HistogramOpts, IntGaugeVec, Opts};
use crate::http::service::metrics::generate_log_buckets;
/// Exponential buckets for routing overhead histograms:
/// from 0.0001 ms (0.1 µs) to ~13.1 ms, factor 2, 18 steps.
fn overhead_buckets() -> Vec<f64> {
prometheus::exponential_buckets(0.0001, 2.0, 18).expect("exponential buckets should not fail")
/// Buckets for CPU-bound compute phases (block hashing, sequence hashing).
///
/// Returns 15 exponentially spaced upper bounds that start at 0.001 and double
/// at every step, so the largest finite bound is 0.001 * 2^14 = 16.384. The
/// histograms fed by these buckets record durations in milliseconds, i.e. the
/// range is 1 µs to ~16.4 ms.
///
/// # Panics
///
/// Panics only if `prometheus::exponential_buckets` rejects its arguments.
/// The arguments are compile-time constants, so a panic here indicates a
/// programming error rather than a runtime condition.
fn compute_overhead_buckets() -> Vec<f64> {
    prometheus::exponential_buckets(0.001, 2.0, 15)
        .expect("compute overhead buckets: constant exponential_buckets arguments must be valid")
}
/// Buckets for async phases (indexer find_matches, scheduling, total).
///
/// Returns 17 exponentially spaced upper bounds that start at 0.01 and triple
/// at every step, so the largest finite bound is 0.01 * 3^16 = 430467.21. The
/// histograms fed by these buckets record durations in milliseconds, i.e. the
/// range is 10 µs to ~430 s.
///
/// # Panics
///
/// Panics only if `prometheus::exponential_buckets` rejects its arguments.
/// The arguments are compile-time constants, so a panic here indicates a
/// programming error rather than a runtime condition.
fn async_overhead_buckets() -> Vec<f64> {
    prometheus::exponential_buckets(0.01, 3.0, 17)
        .expect("async overhead buckets: constant exponential_buckets arguments must be valid")
}
// ---------------------------------------------------------------------------
......@@ -219,39 +223,45 @@ impl RoutingOverheadMetrics {
instance_id: u64,
) -> Result<(), prometheus::Error> {
let m = ROUTING_OVERHEAD_METRICS.get_or_init(|| {
let buckets = overhead_buckets();
let compute_buckets = compute_overhead_buckets();
let async_buckets = async_overhead_buckets();
let router_id = instance_id.to_string();
let make = |suffix: &str, help: &str| {
let make = |suffix: &str, help: &str, buckets: Vec<f64>| {
let name = format!("{}_{}", name_prefix::ROUTER, suffix);
prometheus::Histogram::with_opts(
HistogramOpts::new(name, help)
.const_label(labels::ROUTER_ID, &router_id)
.buckets(buckets.clone()),
.buckets(buckets),
)
};
let block_hashing = make(
routing_overhead::BLOCK_HASHING_MS,
"Time spent computing block hashes in milliseconds",
compute_buckets.clone(),
)
.expect("overhead_block_hashing_ms");
let indexer_find_matches = make(
routing_overhead::INDEXER_FIND_MATCHES_MS,
"Time spent in indexer find_matches in milliseconds",
async_buckets.clone(),
)
.expect("overhead_indexer_find_matches_ms");
let seq_hashing = make(
routing_overhead::SEQ_HASHING_MS,
"Time spent computing sequence hashes in milliseconds",
compute_buckets,
)
.expect("overhead_seq_hashing_ms");
let scheduling = make(
routing_overhead::SCHEDULING_MS,
"Time spent in scheduler worker selection in milliseconds",
async_buckets.clone(),
)
.expect("overhead_scheduling_ms");
let total = make(
routing_overhead::TOTAL_MS,
"Total routing overhead per request in milliseconds",
async_buckets,
)
.expect("overhead_total_ms");
Arc::new(Self {
......@@ -600,7 +610,7 @@ dynamo_frontend_router_queue_pending_requests{worker_type=\"decode\"} 5
// Verify the overhead constants produce valid histogram names when
// combined with dynamo_router_ prefix.
let registry = prometheus::Registry::new();
let buckets = overhead_buckets();
let buckets = async_overhead_buckets();
let prefix = name_prefix::ROUTER;
let name = format!("{}_{}", prefix, routing_overhead::TOTAL_MS);
let total = prometheus::Histogram::with_opts(
......
......@@ -36,7 +36,7 @@ pub static WORK_HANDLER_TIME_TO_FIRST_RESPONSE_SECONDS: Lazy<Histogram> = Lazy::
"Backend processing time from handle_payload entry to prologue sent (seconds)",
)
.buckets(vec![
0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0,
]),
)
.expect("work_handler_time_to_first_response_seconds histogram")
......
......@@ -61,11 +61,18 @@ impl WorkHandlerMetrics {
metrics_labels,
)?;
// Custom buckets for inference workloads: retain sub-second resolution for
// fast operations, extend well beyond the default 10s ceiling to capture
// long-running generation requests that can last minutes.
let request_duration_buckets = vec![
0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0, 60.0, 120.0,
300.0, 600.0,
];
let request_duration = metrics.create_histogram(
work_handler::REQUEST_DURATION_SECONDS,
"Time spent processing requests by work handler",
metrics_labels,
None,
Some(request_duration_buckets),
)?;
let inflight_requests = metrics.create_intgauge(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment