"components/vscode:/vscode.git/clone" did not exist on "2f666b73a2a83be094d7ea9febbf044ae05e4279"
Unverified commit e5e65b58, authored by jthomson04, committed by GitHub
Browse files

fix: Flaky cuda event test (#7747)


Signed-off-by: jthomson04 <jwillthomson19@gmail.com>
parent a8894467
...@@ -32,56 +32,3 @@ impl CompletionChecker for CudaEventChecker { ...@@ -32,56 +32,3 @@ impl CompletionChecker for CudaEventChecker {
} }
} }
} }
#[cfg(all(test, feature = "testing-kvbm"))]
mod tests {
    use crate::manager::TransferManager;
    use crate::transfer::tests::CudaSleep;
    use dynamo_memory::nixl::NixlAgent;
    use std::time::{Duration, Instant};

    /// Checks that the notification returned by `register_cuda_event` does not
    /// resolve until GPU work queued ahead of the recorded event has run.
    ///
    /// A 600ms sleep kernel is launched on the H2D stream, an event is recorded
    /// behind it, and the test awaits the notification registered for that event.
    /// The elapsed wall-clock time must be at least 500ms; the 100ms margin is
    /// the same tolerance the test has always used.
    ///
    /// Host-side launch latency is deliberately NOT asserted. A wall-clock bound
    /// on how quickly `launch` returns depends on machine load and scheduling,
    /// not on the behaviour under test, and it made this test flaky.
    #[tokio::test]
    async fn test_cuda_event_delayed_notification() {
        let agent = NixlAgent::new("test_agent").unwrap();
        let manager = TransferManager::builder()
            .cuda_device_id(0)
            .nixl_agent(agent)
            .build()
            .unwrap();

        let stream = manager.h2d_stream();
        let cuda_ctx = manager.cuda_context();

        // Get or create the CudaSleep utility (compiles kernel and calibrates on first use)
        let cuda_sleep = CudaSleep::for_context(cuda_ctx).unwrap();

        // Start timing before the launch so the measurement covers the whole
        // launch -> record -> notify sequence and needs no subtraction.
        let start = Instant::now();
        cuda_sleep
            .launch(Duration::from_millis(600), stream)
            .unwrap();

        let event = stream.record_event(None).unwrap();
        let notification = manager.register_cuda_event(event);

        // Bound the wait: if the notification is never delivered the test fails
        // with a clear message instead of hanging the whole test run.
        tokio::time::timeout(Duration::from_secs(5), notification)
            .await
            .expect("notification should complete once the CUDA event signals")
            .unwrap();
        let wait_time = start.elapsed();

        println!("GPU sleep test: total wait {:?}", wait_time);

        assert!(
            wait_time >= Duration::from_millis(500),
            "wait time should reflect >=500ms of GPU work: {:?}",
            wait_time
        );
    }
}
...@@ -56,28 +56,20 @@ mod tests { ...@@ -56,28 +56,20 @@ mod tests {
// Get or create the CudaSleep utility (compiles kernel and calibrates on first use) // Get or create the CudaSleep utility (compiles kernel and calibrates on first use)
let cuda_sleep = CudaSleep::for_context(cuda_ctx).unwrap(); let cuda_sleep = CudaSleep::for_context(cuda_ctx).unwrap();
// Test 1: Launch sleep and wait via async notification let start = Instant::now();
let t0_queue_start = Instant::now();
cuda_sleep cuda_sleep
.launch(Duration::from_millis(600), stream) .launch(Duration::from_millis(600), stream)
.unwrap(); .unwrap();
let queue_time = t0_queue_start.elapsed();
let event = stream.record_event(None).unwrap(); let event = stream.record_event(None).unwrap();
let notification = manager.register_cuda_event(event); let notification = manager.register_cuda_event(event);
notification.await.unwrap(); tokio::time::timeout(Duration::from_secs(5), notification)
let wait_time = t0_queue_start.elapsed() - queue_time; .await
.expect("notification should complete once the CUDA event signals")
println!( .unwrap();
"GPU sleep test: queue {:?}, wait {:?}", let wait_time = start.elapsed();
queue_time, wait_time
);
assert!( println!("GPU sleep test: total wait {:?}", wait_time);
queue_time < Duration::from_millis(10),
"launching the sleep kernel should be fast: {:?}",
queue_time
);
assert!( assert!(
wait_time >= Duration::from_millis(500), wait_time >= Duration::from_millis(500),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment