Unverified Commit e5e65b58 authored by jthomson04's avatar jthomson04 Committed by GitHub
Browse files

fix: Flaky cuda event test (#7747)


Signed-off-by: jthomson04 <jwillthomson19@gmail.com>
parent a8894467
......@@ -32,56 +32,3 @@ impl CompletionChecker for CudaEventChecker {
}
}
}
#[cfg(all(test, feature = "testing-kvbm"))]
mod tests {
    use crate::manager::TransferManager;
    use crate::transfer::tests::CudaSleep;
    use dynamo_memory::nixl::NixlAgent;
    use std::time::{Duration, Instant};

    /// Verifies that a registered CUDA event only notifies after the GPU work
    /// queued ahead of it on the stream has finished.
    ///
    /// A 600ms sleep kernel is launched on the H2D stream, an event is recorded
    /// behind it, and the notification returned by `register_cuda_event` is
    /// awaited. The elapsed wall-clock time must be at least 500ms.
    ///
    /// The test deliberately avoids asserting on host-side launch latency:
    /// that is a scheduling-dependent wall-clock bound unrelated to event
    /// notification and made the test flaky.
    #[tokio::test]
    async fn test_cuda_event_delayed_notification() {
        let agent = NixlAgent::new("test_agent").unwrap();
        let manager = TransferManager::builder()
            .cuda_device_id(0)
            .nixl_agent(agent)
            .build()
            .unwrap();
        let stream = manager.h2d_stream();
        let cuda_ctx = manager.cuda_context();

        // Get or create the CudaSleep utility (compiles kernel and calibrates on first use)
        let cuda_sleep = CudaSleep::for_context(cuda_ctx).unwrap();

        // Launch the sleep and wait via async notification. Timing starts before
        // the launch so the measurement covers the whole kernel no matter how
        // long the host-side launch call itself takes.
        let start = Instant::now();
        cuda_sleep
            .launch(Duration::from_millis(600), stream)
            .unwrap();
        let event = stream.record_event(None).unwrap();
        let notification = manager.register_cuda_event(event);

        // Bound the wait so a lost notification fails the test instead of
        // hanging the whole suite.
        tokio::time::timeout(Duration::from_secs(5), notification)
            .await
            .expect("notification should complete once the CUDA event signals")
            .unwrap();
        let wait_time = start.elapsed();

        println!("GPU sleep test: total wait {:?}", wait_time);
        assert!(
            wait_time >= Duration::from_millis(500),
            "wait time should reflect >=500ms of GPU work: {:?}",
            wait_time
        );
    }
}
......@@ -56,28 +56,20 @@ mod tests {
// Get or create the CudaSleep utility (compiles kernel and calibrates on first use)
let cuda_sleep = CudaSleep::for_context(cuda_ctx).unwrap();
// Test 1: Launch sleep and wait via async notification
let t0_queue_start = Instant::now();
let start = Instant::now();
cuda_sleep
.launch(Duration::from_millis(600), stream)
.unwrap();
let queue_time = t0_queue_start.elapsed();
let event = stream.record_event(None).unwrap();
let notification = manager.register_cuda_event(event);
notification.await.unwrap();
let wait_time = t0_queue_start.elapsed() - queue_time;
println!(
"GPU sleep test: queue {:?}, wait {:?}",
queue_time, wait_time
);
tokio::time::timeout(Duration::from_secs(5), notification)
.await
.expect("notification should complete once the CUDA event signals")
.unwrap();
let wait_time = start.elapsed();
assert!(
queue_time < Duration::from_millis(10),
"launching the sleep kernel should be fast: {:?}",
queue_time
);
println!("GPU sleep test: total wait {:?}", wait_time);
assert!(
wait_time >= Duration::from_millis(500),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment