// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 //! Shared utilities for benchmark binaries. use anyhow::{Context, Result}; use serde::{Deserialize, Serialize}; use std::time::Duration; // --------------------------------------------------------------------------- // Latency statistics // --------------------------------------------------------------------------- #[derive(Debug, Clone)] pub struct LatencyStats { pub min: Duration, pub max: Duration, pub avg: Duration, pub p50: Duration, pub p95: Duration, pub p99: Duration, pub throughput_ops_sec: f64, } impl LatencyStats { pub fn from_durations(durations: &[Duration]) -> Option { if durations.is_empty() { return None; } let mut sorted = durations.to_vec(); sorted.sort(); let n = sorted.len(); let total: Duration = sorted.iter().sum(); let avg = total / n as u32; Some(Self { min: sorted[0], max: sorted[n - 1], avg, p50: sorted[n / 2], p95: sorted[n * 95 / 100], p99: sorted[n * 99 / 100], throughput_ops_sec: n as f64 / total.as_secs_f64(), }) } /// Print formatted latency statistics to stdout. pub fn print(&self, operation: &str, blocks_per_op: usize) { println!("\n{} Latency Statistics:", operation); println!(" min: {:>12?}", self.min); println!(" avg: {:>12?}", self.avg); println!(" p50: {:>12?}", self.p50); println!(" p95: {:>12?}", self.p95); println!(" p99: {:>12?}", self.p99); println!(" max: {:>12?}", self.max); println!(" throughput: {:.2} ops/sec", self.throughput_ops_sec); println!( " throughput: {:.2} blocks/sec", self.throughput_ops_sec * blocks_per_op as f64 ); } } // --------------------------------------------------------------------------- // Time-bucketed latency statistics // --------------------------------------------------------------------------- #[derive(Debug, Clone, Serialize)] pub struct TimeBucketStats { pub bucket_start_sec: u64, pub bucket_end_sec: u64, pub count: usize, pub latency_min_us: u64, pub latency_p50_us: u64, pub latency_p95_us: u64, pub latency_max_us: u64, } /// Compute per-bucket latency statistics. /// /// Each item is a `(latency, completion_time)` pair where `completion_time` /// is relative to the measurement start. pub fn compute_time_bucket_stats( items: &[(Duration, Duration)], bucket_size_secs: u64, ) -> Vec { if items.is_empty() || bucket_size_secs == 0 { return Vec::new(); } let max_completion = items .iter() .map(|&(_, ct)| ct) .max() .unwrap_or(Duration::ZERO); let num_buckets = (max_completion.as_secs() / bucket_size_secs) + 1; let mut bucket_latencies: Vec> = vec![Vec::new(); num_buckets as usize]; for &(latency, completion_time) in items { let bucket_idx = (completion_time.as_secs() / bucket_size_secs) as usize; if bucket_idx < bucket_latencies.len() { bucket_latencies[bucket_idx].push(latency); } } bucket_latencies .iter() .enumerate() .filter_map(|(idx, latencies)| { if latencies.is_empty() { return None; } let stats = LatencyStats::from_durations(latencies)?; Some(TimeBucketStats { bucket_start_sec: idx as u64 * bucket_size_secs, bucket_end_sec: (idx as u64 + 1) * bucket_size_secs, count: latencies.len(), latency_min_us: stats.min.as_micros() as u64, latency_p50_us: stats.p50.as_micros() as u64, latency_p95_us: stats.p95.as_micros() as u64, latency_max_us: stats.max.as_micros() as u64, }) }) .collect() } pub fn print_time_bucket_report(buckets: &[TimeBucketStats]) { if buckets.is_empty() { println!(" No time bucket data available"); return; } println!( " {:>8} {:>8} {:>12} {:>12} {:>12} {:>12}", "Time(s)", "Count", "Min(ms)", "P50(ms)", "P95(ms)", "Max(ms)" ); println!(" {}", "-".repeat(68)); for bucket in buckets { println!( " {:>3}-{:<4} {:>8} {:>12.1} {:>12.1} {:>12.1} {:>12.1}", bucket.bucket_start_sec, bucket.bucket_end_sec, bucket.count, bucket.latency_min_us as f64 / 1000.0, bucket.latency_p50_us as f64 / 1000.0, bucket.latency_p95_us as f64 / 1000.0, bucket.latency_max_us as f64 / 1000.0, ); } } // --------------------------------------------------------------------------- // Latency sample (for raw JSON export) // --------------------------------------------------------------------------- #[derive(Debug, Clone, Serialize)] pub struct LatencySample { pub latency_us: u64, pub completion_time_ms: u64, pub success: bool, } // --------------------------------------------------------------------------- // OpenAI-style chat types // --------------------------------------------------------------------------- #[derive(Debug, Clone, Serialize)] pub struct ChatMessage { pub role: String, pub content: String, } #[derive(Debug, Serialize)] pub struct ChatCompletionRequest { pub model: String, pub messages: Vec, #[serde(skip_serializing_if = "Option::is_none")] pub max_tokens: Option, } // --------------------------------------------------------------------------- // Model auto-detection // --------------------------------------------------------------------------- #[derive(Debug, Deserialize)] struct ModelsResponse { data: Vec, } #[derive(Debug, Deserialize)] struct ModelInfo { id: String, } pub async fn fetch_model_name(frontend_url: &str) -> Result { let client = reqwest::Client::new(); let url = format!("{}/v1/models", frontend_url); println!(" Auto-detecting model from {}...", url); let response = client .get(&url) .send() .await .context("Failed to connect to frontend /v1/models endpoint")?; if !response.status().is_success() { anyhow::bail!("Models endpoint returned status: {}", response.status()); } let models: ModelsResponse = response .json() .await .context("Failed to parse models response")?; match models.data.len() { 0 => anyhow::bail!("No models found at endpoint. Is a backend running?"), 1 => { let model_id = models.data[0].id.clone(); println!(" Auto-detected model: {}", model_id); Ok(model_id) } n => { println!(" Multiple models available ({}):", n); for m in &models.data { println!(" - {}", m.id); } anyhow::bail!("Multiple models available. Please specify --model explicitly.") } } }