fix(metrics): widen histogram buckets for request duration and router overhead (#8087)

Signed-off-by: tmontfort <tmontfort@nvidia.com>
Signed-off-by: Thomas Montfort <tmontfort@nvidia.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

fix(metrics): widen histogram buckets for request duration and router overhead (#8087)
Signed-off-by: tmontfort <tmontfort@nvidia.com> Signed-off-by: Thomas Montfort <tmontfort@nvidia.com> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
bba26d55 · Thomas Montfort · GitHub · 243d7f07 · bba26d55 · bba26d55
Unverified Commit bba26d55 authored Apr 15, 2026 by Thomas Montfort Committed by GitHub Apr 15, 2026
4 changed files
--- a/lib/llm/src/http/service/metrics.rs
+++ b/lib/llm/src/http/service/metrics.rs
@@ -521,7 +521,7 @@ impl Metrics {

        // Request duration buckets: configurable via DYN_METRICS_REQUEST_DURATION_{MIN,MAX,COUNT}
        let (req_dur_min, req_dur_max, req_dur_count) =
-            parse_bucket_config("DYN_METRICS_REQUEST_DURATION", 1.0, 256.0, 10);
+            parse_bucket_config("DYN_METRICS_REQUEST_DURATION", 1.0, 512.0, 10);
        let request_duration_buckets =
            generate_log_buckets(req_dur_min, req_dur_max, req_dur_count);


--- a/lib/llm/src/kv_router/metrics.rs
+++ b/lib/llm/src/kv_router/metrics.rs
@@ -56,10 +56,14 @@ use prometheus::{HistogramOpts, IntGaugeVec, Opts};

 use crate::http::service::metrics::generate_log_buckets;

-/// Exponential buckets for routing overhead histograms:
-/// from 0.0001 ms (0.1 µs) to ~13.1 ms, factor 2, 18 steps.
-fn overhead_buckets() -> Vec<f64> {
-    prometheus::exponential_buckets(0.0001, 2.0, 18).expect("exponential buckets should not fail")
+/// Buckets for CPU-bound compute phases (block hashing, sequence hashing): 0.001 ms to ~16.4 ms.
+fn compute_overhead_buckets() -> Vec<f64> {
+    prometheus::exponential_buckets(0.001, 2.0, 15).expect("exponential buckets should not fail")
+}
+
+/// Buckets for async phases (indexer find_matches, scheduling, total): 0.01 ms to ~430 s.
+fn async_overhead_buckets() -> Vec<f64> {
+    prometheus::exponential_buckets(0.01, 3.0, 17).expect("exponential buckets should not fail")
 }

 // ---------------------------------------------------------------------------
@@ -219,39 +223,45 @@ impl RoutingOverheadMetrics {
        instance_id: u64,
    ) -> Result<(), prometheus::Error> {
        let m = ROUTING_OVERHEAD_METRICS.get_or_init(|| {
-            let buckets = overhead_buckets();
+            let compute_buckets = compute_overhead_buckets();
+            let async_buckets = async_overhead_buckets();
            let router_id = instance_id.to_string();
-            let make = |suffix: &str, help: &str| {
+            let make = |suffix: &str, help: &str, buckets: Vec<f64>| {
                let name = format!("{}_{}", name_prefix::ROUTER, suffix);
                prometheus::Histogram::with_opts(
                    HistogramOpts::new(name, help)
                        .const_label(labels::ROUTER_ID, &router_id)
-                        .buckets(buckets.clone()),
+                        .buckets(buckets),
                )
            };
            let block_hashing = make(
                routing_overhead::BLOCK_HASHING_MS,
                "Time spent computing block hashes in milliseconds",
+                compute_buckets.clone(),
            )
            .expect("overhead_block_hashing_ms");
            let indexer_find_matches = make(
                routing_overhead::INDEXER_FIND_MATCHES_MS,
                "Time spent in indexer find_matches in milliseconds",
+                async_buckets.clone(),
            )
            .expect("overhead_indexer_find_matches_ms");
            let seq_hashing = make(
                routing_overhead::SEQ_HASHING_MS,
                "Time spent computing sequence hashes in milliseconds",
+                compute_buckets,
            )
            .expect("overhead_seq_hashing_ms");
            let scheduling = make(
                routing_overhead::SCHEDULING_MS,
                "Time spent in scheduler worker selection in milliseconds",
+                async_buckets.clone(),
            )
            .expect("overhead_scheduling_ms");
            let total = make(
                routing_overhead::TOTAL_MS,
                "Total routing overhead per request in milliseconds",
+                async_buckets,
            )
            .expect("overhead_total_ms");
            Arc::new(Self {
@@ -600,7 +610,7 @@ dynamo_frontend_router_queue_pending_requests{worker_type=\"decode\"} 5
        // Verify the overhead constants produce valid histogram names when
        // combined with dynamo_router_ prefix.
        let registry = prometheus::Registry::new();
-        let buckets = overhead_buckets();
+        let buckets = async_overhead_buckets();
        let prefix = name_prefix::ROUTER;
        let name = format!("{}_{}", prefix, routing_overhead::TOTAL_MS);
        let total = prometheus::Histogram::with_opts(

--- a/lib/runtime/src/metrics/work_handler_perf.rs
+++ b/lib/runtime/src/metrics/work_handler_perf.rs
@@ -36,7 +36,7 @@ pub static WORK_HANDLER_TIME_TO_FIRST_RESPONSE_SECONDS: Lazy<Histogram> = Lazy::
            "Backend processing time from handle_payload entry to prologue sent (seconds)",
        )
        .buckets(vec![
-            0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
+            0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0,
        ]),
    )
    .expect("work_handler_time_to_first_response_seconds histogram")

--- a/lib/runtime/src/pipeline/network/ingress/push_handler.rs
+++ b/lib/runtime/src/pipeline/network/ingress/push_handler.rs
@@ -61,11 +61,18 @@ impl WorkHandlerMetrics {
            metrics_labels,
        )?;

+        // Custom buckets for inference workloads: retain sub-second resolution for
+        // fast operations, extend well beyond the default 10s ceiling to capture
+        // long-running generation requests that can last minutes.
+        let request_duration_buckets = vec![
+            0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 20.0, 30.0, 60.0, 120.0,
+            300.0, 600.0,
+        ];
        let request_duration = metrics.create_histogram(
            work_handler::REQUEST_DURATION_SECONDS,
            "Time spent processing requests by work handler",
            metrics_labels,
-            None,
+            Some(request_duration_buckets),
        )?;

        let inflight_requests = metrics.create_intgauge(