feat: Metric histogram buckets configureable from env (#3562)

Signed-off-by: William Arnold <7565007+Aphoh@users.noreply.github.com>

feat: Metric histogram buckets configureable from env (#3562)
Signed-off-by: William Arnold <7565007+Aphoh@users.noreply.github.com>
f0065bb4 · William Arnold · GitHub · d4ff6f0a · f0065bb4 · f0065bb4
Unverified Commit f0065bb4 authored Oct 13, 2025 by William Arnold Committed by GitHub Oct 13, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 342 additions and 32 deletions

lib/llm/src/http/service/metrics.rs lib/llm/src/http/service/metrics.rs +330 -16

lib/llm/tests/http-service.rs lib/llm/tests/http-service.rs +12 -16

No files found.
--- a/lib/llm/src/http/service/metrics.rs
+++ b/lib/llm/src/http/service/metrics.rs
@@ -26,6 +26,128 @@ pub use prometheus::Registry;
 use super::RouteDoc;
+/// Generate log-spaced histogram buckets with values rounded to 2 significant figures.
+///
+/// # Arguments
+/// * `min` - Minimum value for the buckets (must be > 0 for log spacing)
+/// * `max` - Maximum value for the buckets (must be > min)
+/// * `count` - Number of buckets to generate
+///
+/// # Returns
+/// A vector of log-spaced values, always starting with 0.0 and ending with the rounded max value.
+/// Duplicates created by rounding are removed, so the final count may be less than requested.
+///
+/// # Note
+/// With 2 significant figures, there are roughly 90 unique values per order of magnitude.
+/// Requesting more buckets than can be uniquely represented will result in deduplication.
+fn generate_log_buckets(min: f64, max: f64, count: usize) -> Vec<f64> {
+    if count == 0 {
+        return vec![];
+    }
+    if count == 1 {
+        return vec![0.0];
+    }
+    let requested_count = count;
+    let mut buckets = Vec::with_capacity(count);
+    buckets.push(0.0);
+    // Generate log-spaced values from min to max
+    for i in 1..count {
+        let log_min = min.ln();
+        let log_max = max.ln();
+        let log_value = log_min + (log_max - log_min) * (i as f64) / ((count - 1) as f64);
+        let value = log_value.exp();
+        buckets.push(round_to_sig_figs(value, 2));
+    }
+    // Remove consecutive duplicates (buckets are already sorted)
+    let original_len = buckets.len();
+    buckets.dedup();
+    // Warn if significant deduplication occurred
+    if buckets.len() < original_len && (original_len - buckets.len()) > original_len / 10 {
+        tracing::warn!(
+            requested = requested_count,
+            unique = buckets.len(),
+            duplicates = original_len - buckets.len(),
+            min = min,
+            max = max,
+            "Histogram bucket generation: Significant duplicate values after rounding to 2 sig figs. \
+             Consider reducing bucket count or increasing range."
+        );
+    }
+    buckets
+}
+/// Round a number to a specified number of significant figures
+fn round_to_sig_figs(value: f64, sig_figs: u32) -> f64 {
+    if value == 0.0 {
+        return 0.0;
+    }
+    let magnitude = value.abs().log10().floor();
+    let scale = 10_f64.powf(sig_figs as f64 - 1.0 - magnitude);
+    (value * scale).round() / scale
+}
+const MAX_BUCKET_COUNT: usize = 512;
+fn validate_bucket_config(min: f64, max: f64, count: usize) -> bool {
+    min.is_finite()
+        && max.is_finite()
+        && min > 0.0
+        && min < max
+        && count > 0
+        && count <= MAX_BUCKET_COUNT
+}
+/// Parse histogram bucket configuration from environment variables
+/// Returns (min, max, count) with defaults if not specified
+fn parse_bucket_config(
+    env_prefix: &str,
+    default_min: f64,
+    default_max: f64,
+    default_count: usize,
+) -> (f64, f64, usize) {
+    if !validate_bucket_config(default_min, default_max, default_count) {
+        tracing::error!(
+            default_min,
+            default_max,
+            default_count,
+            "Invalid default histogram configuration"
+        );
+        return (1.0, 10.0, 10);
+    }
+    let mut min = std::env::var(format!("{env_prefix}_MIN"))
+        .ok()
+        .and_then(|s| s.parse::<f64>().ok())
+        .unwrap_or(default_min);
+    let mut max = std::env::var(format!("{env_prefix}_MAX"))
+        .ok()
+        .and_then(|s| s.parse::<f64>().ok())
+        .unwrap_or(default_max);
+    let mut count = std::env::var(format!("{env_prefix}_COUNT"))
+        .ok()
+        .and_then(|s| s.parse::<usize>().ok())
+        .unwrap_or(default_count);
+    if !validate_bucket_config(min, max, count) {
+        tracing::warn!(
+            min=%min,
+            max=%max,
+            count=%count,
+            "Invalid histogram configuration given, using defaults"
+        );
+        min = default_min;
+        max = default_max;
+        count = default_count;
+    }
+    (min, max, count)
+}
 /// State for metrics handler with custom backend support
 struct MetricsHandlerState {
    registry: Arc<Registry>,
@@ -147,6 +269,17 @@ impl Metrics {
    /// - `{prefix}_time_to_first_token_seconds` - HistogramVec for time to first token in seconds
    /// - `{prefix}_inter_token_latency_seconds` - HistogramVec for inter-token latency in seconds
    ///
+    /// ## Histogram Bucket Configuration
+    ///
+    /// All histograms use log-spaced buckets rounded to 2 significant figures. Bucket configuration
+    /// can be customized via environment variables (MIN: minimum value, MAX: maximum value, COUNT: number of buckets):
+    ///
+    /// - `DYN_METRICS_REQUEST_DURATION_{MIN,MAX,COUNT}` - Request duration histogram (defaults: 1.0, 256.0, 10)
+    /// - `DYN_METRICS_INPUT_SEQUENCE_{MIN,MAX,COUNT}` - Input sequence length histogram (defaults: 50.0, 128000.0, 12)
+    /// - `DYN_METRICS_OUTPUT_SEQUENCE_{MIN,MAX,COUNT}` - Output sequence length histogram (defaults: 50.0, 32000.0, 10)
+    /// - `DYN_METRICS_TTFT_{MIN,MAX,COUNT}` - Time to first token histogram (defaults: 0.001, 480.0, 18)
+    /// - `DYN_METRICS_ITL_{MIN,MAX,COUNT}` - Inter-token latency histogram (defaults: 0.001, 2.0, 13)
+    ///
    /// ## Model Configuration Metrics
    ///
    /// Runtime config metrics (from ModelRuntimeConfig):
@@ -213,64 +346,77 @@ impl Metrics {
        )
        .unwrap();
-        let buckets = vec![0.0, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0];
+        // Request duration buckets: configurable via DYN_METRICS_REQUEST_DURATION_{MIN,MAX,COUNT}
+        let (req_dur_min, req_dur_max, req_dur_count) =
+            parse_bucket_config("DYN_METRICS_REQUEST_DURATION", 1.0, 256.0, 10);
+        let request_duration_buckets =
+            generate_log_buckets(req_dur_min, req_dur_max, req_dur_count);
        let request_duration = HistogramVec::new(
            HistogramOpts::new(
                frontend_metric_name(frontend_service::REQUEST_DURATION_SECONDS),
                "Duration of LLM requests",
            )
-            .buckets(buckets),
+            .buckets(request_duration_buckets),
            &["model"],
        )
        .unwrap();
+        // Input sequence length buckets: configurable via DYN_METRICS_INPUT_SEQUENCE_{MIN,MAX,COUNT}
+        let (isl_min, isl_max, isl_count) =
+            parse_bucket_config("DYN_METRICS_INPUT_SEQUENCE", 50.0, 128000.0, 12);
+        let input_sequence_buckets = generate_log_buckets(isl_min, isl_max, isl_count);
        let input_sequence_length = HistogramVec::new(
            HistogramOpts::new(
                frontend_metric_name(frontend_service::INPUT_SEQUENCE_TOKENS),
                "Input sequence length in tokens",
            )
-            .buckets(vec![
+            .buckets(input_sequence_buckets),
-                0.0, 50.0, 100.0, 500.0, 1000.0, 2000.0, 4000.0, 8000.0, 16000.0, 32000.0, 64000.0,
-                128000.0,
-            ]),
            &["model"],
        )
        .unwrap();
+        // Output sequence length buckets: configurable via DYN_METRICS_OUTPUT_SEQUENCE_{MIN,MAX,COUNT}
+        let (osl_min, osl_max, osl_count) =
+            parse_bucket_config("DYN_METRICS_OUTPUT_SEQUENCE", 50.0, 32000.0, 10);
+        let output_sequence_buckets = generate_log_buckets(osl_min, osl_max, osl_count);
        let output_sequence_length = HistogramVec::new(
            HistogramOpts::new(
                frontend_metric_name(frontend_service::OUTPUT_SEQUENCE_TOKENS),
                "Output sequence length in tokens",
            )
-            .buckets(vec![
+            .buckets(output_sequence_buckets),
-                0.0, 50.0, 100.0, 500.0, 1000.0, 2000.0, 4000.0, 8000.0, 16000.0, 32000.0,
-            ]),
            &["model"],
        )
        .unwrap();
+        // Time to first token buckets: configurable via DYN_METRICS_TTFT_{MIN,MAX,COUNT}
+        let (ttft_min, ttft_max, ttft_count) =
+            parse_bucket_config("DYN_METRICS_TTFT", 0.001, 480.0, 18);
+        let time_to_first_token_buckets = generate_log_buckets(ttft_min, ttft_max, ttft_count);
        let time_to_first_token = HistogramVec::new(
            HistogramOpts::new(
                frontend_metric_name(frontend_service::TIME_TO_FIRST_TOKEN_SECONDS),
                "Time to first token in seconds",
            )
-            .buckets(vec![
+            .buckets(time_to_first_token_buckets),
-                0.0, 0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0,
-                60.0, 120.0, 240.0, 480.0,
-            ]),
            &["model"],
        )
        .unwrap();
+        // Inter-token latency buckets: configurable via DYN_METRICS_ITL_{MIN,MAX,COUNT}
+        let (itl_min, itl_max, itl_count) = parse_bucket_config("DYN_METRICS_ITL", 0.001, 2.0, 13);
+        let inter_token_latency_buckets = generate_log_buckets(itl_min, itl_max, itl_count);
        let inter_token_latency = HistogramVec::new(
            HistogramOpts::new(
                frontend_metric_name(frontend_service::INTER_TOKEN_LATENCY_SECONDS),
                "Inter-token latency in seconds",
            )
-            .buckets(vec![
+            .buckets(inter_token_latency_buckets),
-                0.0, 0.001, 0.005, 0.01, 0.015, 0.02, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.0,
-            ]),
            &["model"],
        )
        .unwrap();
@@ -874,3 +1020,171 @@ async fn handler_metrics(State(state): State<Arc<MetricsHandlerState>>) -> impl
    (StatusCode::OK, metrics).into_response()
 }
+#[cfg(test)]
+mod tests {
+    use super::*;
+    #[test]
+    fn test_round_to_sig_figs() {
+        // Test rounding to 2 significant figures
+        assert_eq!(round_to_sig_figs(0.0026, 2), 0.0026);
+        assert_eq!(round_to_sig_figs(0.26, 2), 0.26);
+        assert_eq!(round_to_sig_figs(0.2356, 2), 0.24);
+        assert_eq!(round_to_sig_figs(1.234, 2), 1.2);
+        assert_eq!(round_to_sig_figs(12.34, 2), 12.0);
+        assert_eq!(round_to_sig_figs(123.4, 2), 120.0);
+        assert_eq!(round_to_sig_figs(1234.0, 2), 1200.0);
+        assert_eq!(round_to_sig_figs(0.0, 2), 0.0);
+        // Test edge cases
+        assert_eq!(round_to_sig_figs(0.999, 2), 1.0);
+        assert_eq!(round_to_sig_figs(9.99, 2), 10.0);
+        assert_eq!(round_to_sig_figs(99.9, 2), 100.0);
+    }
+    #[test]
+    fn test_generate_log_buckets_basic() {
+        // Test basic properties
+        let buckets = generate_log_buckets(1.0, 100.0, 5);
+        // Check length
+        assert_eq!(buckets.len(), 5);
+        // Check first value is 0
+        assert_eq!(buckets[0], 0.0);
+        // Check last value is approximately max (rounded to 2 sig figs)
+        assert_eq!(buckets[buckets.len() - 1], 100.0);
+        // Check values are increasing
+        for i in 1..buckets.len() {
+            assert!(
+                buckets[i] > buckets[i - 1],
+                "Bucket values should be increasing: {} <= {}",
+                buckets[i - 1],
+                buckets[i]
+            );
+        }
+    }
+    #[test]
+    fn test_generate_log_buckets_edge_cases() {
+        // Test empty buckets
+        let buckets = generate_log_buckets(1.0, 100.0, 0);
+        assert_eq!(buckets.len(), 0);
+        // Test single bucket
+        let buckets = generate_log_buckets(1.0, 100.0, 1);
+        assert_eq!(buckets.len(), 1);
+        assert_eq!(buckets[0], 0.0);
+        // Test two buckets
+        let buckets = generate_log_buckets(1.0, 100.0, 2);
+        assert_eq!(buckets.len(), 2);
+        assert_eq!(buckets[0], 0.0);
+        assert_eq!(buckets[1], 100.0);
+    }
+    #[test]
+    fn test_generate_log_buckets_always_includes_zero() {
+        // Test various configurations
+        for count in 1..=20 {
+            let buckets = generate_log_buckets(0.1, 1000.0, count);
+            assert_eq!(
+                buckets[0], 0.0,
+                "First bucket should always be 0.0 for count={}",
+                count
+            );
+        }
+    }
+    #[test]
+    fn test_all_buckets_are_two_sig_figs() {
+        let test_cases = vec![
+            (1.0, 256.0, 10),
+            (50.0, 128000.0, 12),
+            (50.0, 32000.0, 10),
+            (0.001, 480.0, 18),
+            (0.001, 2.0, 13),
+        ];
+        for (min, max, count) in test_cases {
+            let buckets = generate_log_buckets(min, max, count);
+            for &value in buckets.iter().skip(1) {
+                let rounded = round_to_sig_figs(value, 2);
+                assert_eq!(
+                    value, rounded,
+                    "Value {} should be rounded to 2 sig figs (min={}, max={}, count={})",
+                    value, min, max, count
+                );
+            }
+        }
+    }
+    #[test]
+    fn test_sig_fig_limitation_with_many_buckets() {
+        // This test demonstrates that 2 sig figs limits the number of unique bucket values
+        // With 1000 requested buckets but only 2 sig figs, we'll get automatic deduplication
+        let buckets = generate_log_buckets(0.0001, 1.0, 1000);
+        println!(
+            "Requested 1000 buckets, got {} total values (including 0.0)",
+            buckets.len()
+        );
+        // With 2 sig figs across 4 orders of magnitude (0.0001 to 1.0),
+        // we can have roughly 90 unique values per order of magnitude
+        // So we expect around 360 unique values maximum
+        assert!(
+            buckets.len() < 500,
+            "Expected fewer than 500 unique buckets due to 2 sig fig limitation, got {}",
+            buckets.len()
+        );
+        // Verify all values are unique (no duplicates remain after deduplication)
+        let mut sorted_buckets = buckets.clone();
+        sorted_buckets.sort_by(|a, b| a.partial_cmp(b).unwrap());
+        sorted_buckets.dedup();
+        assert_eq!(
+            buckets.len(),
+            sorted_buckets.len(),
+            "All buckets should be unique after deduplication"
+        );
+        // Verify first is still 0.0
+        assert_eq!(buckets[0], 0.0);
+        // Verify values are still in increasing order
+        for i in 1..buckets.len() {
+            assert!(
+                buckets[i] > buckets[i - 1],
+                "Buckets should be in increasing order"
+            );
+        }
+    }
+    #[test]
+    fn test_deduplication_preserves_order() {
+        // Test that deduplication maintains increasing order
+        let buckets = generate_log_buckets(0.01, 1.0, 50);
+        // Verify all values are unique
+        let mut unique_check = std::collections::HashSet::new();
+        for &bucket in &buckets {
+            assert!(
+                unique_check.insert(bucket.to_bits()),
+                "Duplicate value {} found after deduplication",
+                bucket
+            );
+        }
+        // Verify order is maintained
+        for i in 1..buckets.len() {
+            assert!(
+                buckets[i] > buckets[i - 1],
+                "Bucket values should be in increasing order after deduplication"
+            );
+        }
+    }
+}
--- a/lib/llm/tests/http-service.rs
+++ b/lib/llm/tests/http-service.rs
@@ -384,26 +384,22 @@ async fn test_http_service() {
    let buckets = histogram.get_bucket();
    let mut found = false;
+    let mut expected_count = 0;
-    for bucket in buckets {
+    for bucket_idx in 1..buckets.len() {
-        let upper_bound = bucket.get_upper_bound();
+        if buckets[bucket_idx].get_upper_bound() >= 2.5
-        let cumulative_count = bucket.get_cumulative_count();
+            && buckets[bucket_idx - 1].get_upper_bound() < 2.5
+        {
-        println!(
+            found = true;
-            "Bucket upper bound: {}, count: {}",
-            upper_bound, cumulative_count
-        );
-        // Since our observation is 2.5, it should fall into the bucket with upper bound 4.0
-        if upper_bound >= 4.0 {
            assert_eq!(
-                cumulative_count, 1,
+                buckets[bucket_idx].get_cumulative_count(),
-                "Observation should be counted in the 4.0 bucket"
+                1,
+                "Observation should be counted in the bucket containing 2.5"
            );
-            found = true;
+            expected_count = 1;
        } else {
            assert_eq!(
-                cumulative_count, 0,
+                buckets[bucket_idx].get_cumulative_count(),
+                expected_count,
                "No observations should be in this bucket"
            );
        }