// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 #[cfg(feature = "testing-cuda")] mod benchmarks { use std::sync::Arc; use criterion::{BenchmarkId, Criterion, criterion_group}; use cudarc::driver::{CudaContext, CudaStream}; use nixl_sys; use tokio::runtime::Runtime; use tokio_util::task::TaskTracker; use dynamo_llm::block_manager::block::transfer::context; struct BenchmarkRuntime { _runtime: Runtime, handle: tokio::runtime::Handle, stream: Arc, nixl_agent: Arc>, } impl BenchmarkRuntime { fn new() -> Self { let runtime = Runtime::new().expect("Failed to create benchmark runtime"); let handle = runtime.handle().clone(); let cuda_ctx = Arc::new(CudaContext::new(0).expect("Failed to create CUDA context")); let stream = cuda_ctx.default_stream(); let nixl_agent = Arc::new(None); Self { _runtime: runtime, handle, stream, nixl_agent, } } fn create_transfer_context(&self) -> context::v2::TransferContext { context::v2::TransferContext::new( self.nixl_agent.clone(), self.stream.clone(), self.handle.clone(), ) } } /// Benchmark blocking synchronization in tight loop /// This measures the baseline performance of direct CUDA event sync fn bench_blocking(c: &mut Criterion) { let runtime = BenchmarkRuntime::new(); let ctx = runtime.create_transfer_context(); let mut group = c.benchmark_group("blocking_sync"); group.warm_up_time(std::time::Duration::from_millis(500)); group.measurement_time(std::time::Duration::from_secs(3)); group.bench_function("sync", |b| { b.iter(|| { let event = ctx.record_event().unwrap(); event.synchronize_blocking().unwrap(); }) }); group.finish(); } /// Benchmark single-threaded async synchronization /// This measures only the tokio spawn_blocking overhead vs direct blocking fn bench_async_single(c: &mut Criterion) { let runtime = BenchmarkRuntime::new(); let ctx = runtime.create_transfer_context(); let mut group = c.benchmark_group("async_sync"); group.warm_up_time(std::time::Duration::from_millis(500)); group.measurement_time(std::time::Duration::from_secs(3)); group.bench_function("sync", |b| { b.iter(|| { runtime._runtime.block_on(async { let event = ctx.record_event().unwrap(); event.synchronize().await.unwrap(); }) }) }); group.finish(); } /// Benchmark concurrent async synchronization at different scales /// This shows where async becomes beneficial due to parallelism fn bench_concurrent_async(c: &mut Criterion) { let runtime = BenchmarkRuntime::new(); let mut group = c.benchmark_group("concurrent_async"); group.warm_up_time(std::time::Duration::from_millis(500)); group.measurement_time(std::time::Duration::from_secs(3)); // Test different concurrency levels for concurrency in [1, 5, 10, 25, 50, 100].iter() { group.bench_with_input( BenchmarkId::new("concurrent", concurrency), concurrency, |b, &concurrency| { let ctx = runtime.create_transfer_context(); b.iter(|| { runtime._runtime.block_on(async { // Spawn concurrent tasks using TaskTracker let tracker = TaskTracker::new(); for _ in 0..concurrency { let ctx_clone = ctx.clone(); tracker.spawn(async move { let event = ctx_clone.record_event().unwrap(); event.synchronize().await.unwrap(); }); } // Wait for all tasks to complete tracker.close(); tracker.wait().await; }); }); }, ); } group.finish(); } /// Benchmark throughput: events per second at different concurrency levels fn bench_throughput(c: &mut Criterion) { let runtime = BenchmarkRuntime::new(); let mut group = c.benchmark_group("throughput"); group.sample_size(50); // Fewer samples for throughput tests group.warm_up_time(std::time::Duration::from_millis(500)); group.measurement_time(std::time::Duration::from_secs(3)); for concurrency in [1, 10, 50].iter() { let events_per_task = 10; // Process multiple events per task group.bench_with_input( BenchmarkId::new("events_per_sec", concurrency), concurrency, |b, &concurrency| { let ctx = runtime.create_transfer_context(); b.iter(|| { runtime._runtime.block_on(async { let tracker = TaskTracker::new(); for _ in 0..concurrency { let ctx_clone = ctx.clone(); tracker.spawn(async move { // Process multiple events per task for _ in 0..events_per_task { let event = ctx_clone.record_event().unwrap(); event.synchronize().await.unwrap(); } }); } tracker.close(); tracker.wait().await; }); }); }, ); } group.finish(); } criterion_group!( benches, // Core comparison benchmarks bench_blocking, bench_async_single, // Concurrency benchmarks bench_concurrent_async, bench_throughput ); } #[cfg(feature = "testing-cuda")] criterion::criterion_main!(benchmarks::benches); #[cfg(not(feature = "testing-cuda"))] fn main() { println!( "Benchmarks require 'testing-cuda' feature. Run with: cargo bench --features testing-cuda" ); }