Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
d784877f
Unverified
Commit
d784877f
authored
May 29, 2025
by
Ryan Olson
Committed by
GitHub
May 29, 2025
Browse files
feat: add critical task execution handle (#1268)
parent
7677f74f
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
587 additions
and
0 deletions
+587
-0
lib/runtime/src/utils.rs
lib/runtime/src/utils.rs
+1
-0
lib/runtime/src/utils/task.rs
lib/runtime/src/utils/task.rs
+586
-0
No files found.
lib/runtime/src/utils.rs
View file @
d784877f
...
...
@@ -17,3 +17,4 @@ pub use tokio::time::{Duration, Instant};
pub
mod
pool
;
pub
mod
stream
;
pub
mod
task
;
lib/runtime/src/utils/task.rs
0 → 100644
View file @
d784877f
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Utilities for handling tasks.
use
anyhow
::{
Context
,
Result
};
use
std
::
future
::
Future
;
use
tokio
::
sync
::
oneshot
;
use
tokio
::
task
::
JoinHandle
;
use
tokio_util
::
sync
::
CancellationToken
;
/// Type alias for the (unsized) trait-object form of a critical task handler.
///
/// A handler is a one-shot callable that receives a [`CancellationToken`] and returns a
/// value of type `Fut`. The alias itself places no bound on `Fut`; callers that need the
/// returned value to be a [`Future`] resolving to `Result<()>` must add that bound
/// themselves (as [`CriticalTaskExecutionHandle::new`] does for its `task_fn` argument).
///
/// The task should monitor the cancellation token and shut down gracefully once it is
/// cancelled.
pub type CriticalTaskHandler<Fut> = dyn FnOnce(CancellationToken) -> Fut + Send + 'static;
/// Handle for a critical task, i.e. a detached task that is expected to complete
/// successfully. The handle provides two cancellation mechanisms:
///
/// 1. **Critical Failure**: If the task returns an error or panics, the parent cancellation
///    token is triggered immediately by a monitoring task that detects the failure.
///
/// 2. **Graceful Shutdown**: The task can be shut down gracefully via its child token,
///    allowing it to complete cleanly without triggering system-wide cancellation.
///
/// This is useful for ensuring that critical detached tasks either complete successfully
/// or trigger the appropriate shutdown procedure when they fail.
pub struct CriticalTaskExecutionHandle {
    /// Join handle of the monitor task that awaits the critical task and forwards its result.
    monitor_task: JoinHandle<()>,
    /// Child of the parent token; handed to the task and cancelled by [`Self::cancel`].
    graceful_shutdown_token: CancellationToken,
    /// Receives the task's final result from the monitor task; consumed by [`Self::join`].
    result_receiver: oneshot::Receiver<Result<()>>,
}
impl
CriticalTaskExecutionHandle
{
/// Create a new [CriticalTaskExecutionHandle] for a critical task.
///
/// # Arguments
/// * `task_fn` - A function that takes a cancellation token and returns the critical task future
/// * `parent_token` - Token that will be cancelled if this critical task fails
/// * `description` - Description for logging purposes
pub
async
fn
new
<
Fut
>
(
task_fn
:
impl
FnOnce
(
CancellationToken
)
->
Fut
+
Send
+
'static
,
parent_token
:
CancellationToken
,
description
:
&
str
,
)
->
Result
<
Self
>
where
Fut
:
Future
<
Output
=
Result
<
()
>>
+
Send
+
'static
,
{
let
graceful_shutdown_token
=
parent_token
.child_token
();
let
description
=
description
.to_string
();
let
parent_token_clone
=
parent_token
.clone
();
// Create channel for communicating results from monitor to handle
let
(
result_sender
,
result_receiver
)
=
oneshot
::
channel
();
let
graceful_shutdown_token_clone
=
graceful_shutdown_token
.clone
();
let
description_clone
=
description
.to_string
();
let
task
=
tokio
::
spawn
(
async
move
{
let
future
=
task_fn
(
graceful_shutdown_token_clone
);
match
future
.await
{
Ok
(())
=>
{
tracing
::
debug!
(
"Critical task '{}' completed successfully"
,
description_clone
);
Ok
(())
}
Err
(
e
)
=>
{
tracing
::
error!
(
"Critical task '{}' failed: {:#}"
,
description_clone
,
e
);
Err
(
e
.context
(
format!
(
"Critical task '{}' failed"
,
description_clone
)))
}
}
});
// Spawn monitor task that immediately joins the main task and detects failures
let
monitor_task
=
{
let
main_task_handle
=
task
;
let
parent_token_monitor
=
parent_token_clone
.clone
();
let
description_monitor
=
description
.clone
();
tokio
::
spawn
(
async
move
{
let
result
=
match
main_task_handle
.await
{
Ok
(
task_result
)
=>
{
// Task completed normally (success or error)
if
task_result
.is_err
()
{
// Error - trigger parent cancellation immediately
parent_token_monitor
.cancel
();
}
task_result
}
Err
(
join_error
)
=>
{
// Task panicked - handle immediately
if
join_error
.is_panic
()
{
let
panic_msg
=
if
let
Ok
(
reason
)
=
join_error
.try_into_panic
()
{
if
let
Some
(
s
)
=
reason
.downcast_ref
::
<
String
>
()
{
s
.clone
()
}
else
if
let
Some
(
s
)
=
reason
.downcast_ref
::
<&
str
>
()
{
s
.to_string
()
}
else
{
"Unknown panic"
.to_string
()
}
}
else
{
"Panic occurred but reason unavailable"
.to_string
()
};
tracing
::
error!
(
"Critical task '{}' panicked: {}"
,
description_monitor
,
panic_msg
);
parent_token_monitor
.cancel
();
// Trigger parent cancellation immediately
Err
(
anyhow
::
anyhow!
(
"Critical task '{}' panicked: {}"
,
description_monitor
,
panic_msg
))
}
else
{
parent_token_monitor
.cancel
();
Err
(
anyhow
::
anyhow!
(
"Failed to join critical task '{}': {}"
,
description_monitor
,
join_error
))
}
}
};
// Send result to handle (ignore if receiver dropped)
let
_
=
result_sender
.send
(
result
);
})
};
Ok
(
Self
{
monitor_task
,
graceful_shutdown_token
,
result_receiver
,
})
}
/// Check if the task awaiting on the [Server]s background event loop has finished.
pub
fn
is_finished
(
&
self
)
->
bool
{
self
.monitor_task
.is_finished
()
}
/// Check if the server's event loop has been cancelled.
pub
fn
is_cancelled
(
&
self
)
->
bool
{
self
.graceful_shutdown_token
.is_cancelled
()
}
/// Gracefully cancel this critical task without triggering system-wide shutdown.
///
/// This signals the task to stop processing and exit cleanly. The task should
/// monitor its cancellation token and respond appropriately.
///
/// This will not propagate to the parent [CancellationToken] unless an error
/// occurs during the shutdown process.
pub
fn
cancel
(
&
self
)
{
self
.graceful_shutdown_token
.cancel
();
}
/// Join on the critical task and return its actual result.
///
/// This will return:
/// - `Ok(())` if the task completed successfully or was gracefully cancelled
/// - `Err(...)` if the task failed or panicked, preserving the original error
///
/// Note: Both errors and panics trigger parent cancellation immediately via the monitor task.
pub
async
fn
join
(
self
)
->
Result
<
()
>
{
match
self
.result_receiver
.await
{
Ok
(
task_result
)
=>
task_result
,
Err
(
_
)
=>
{
// This should rarely happen - means monitor task was dropped/cancelled
Err
(
anyhow
::
anyhow!
(
"Critical task monitor was cancelled"
))
}
}
}
}
#[cfg(test)]
mod
tests
{
use
super
::
*
;
use
std
::
sync
::
atomic
::{
AtomicBool
,
AtomicU32
,
Ordering
};
use
std
::
sync
::
Arc
;
use
std
::
time
::
Duration
;
use
tokio
::
time
::
timeout
;
#[tokio::test]
async fn test_successful_task_completion() {
    // Scenario: the critical task runs to completion without any problem.
    // Expectations:
    // - join() yields Ok(())
    // - the side effect performed by the task is observable
    // - the parent token is left untouched (no critical failure happened)
    let parent_token = CancellationToken::new();
    let completed = Arc::new(AtomicBool::new(false));

    let handle = {
        let flag = Arc::clone(&completed);
        CriticalTaskExecutionHandle::new(
            move |_cancel_token| async move {
                flag.store(true, Ordering::SeqCst);
                Ok(())
            },
            parent_token.clone(),
            "test-success-task",
        )
        .await
        .unwrap()
    };

    // The task must finish successfully ...
    assert!(handle.join().await.is_ok());
    // ... after having done its work ...
    assert!(completed.load(Ordering::SeqCst));
    // ... and without cancelling the parent.
    assert!(!parent_token.is_cancelled());
}
#[tokio::test]
async fn test_task_failure_cancels_parent_token() {
    // Test: A critical task that returns an error (critical failure)
    // Verifies:
    // - Task error is properly propagated to caller
    // - Parent cancellation token is triggered (critical failure behavior)
    // - The original error message is preserved AND the added context is present
    // - Demonstrates the "critical" aspect - failures affect the entire system
    let parent_token = CancellationToken::new();

    let handle = CriticalTaskExecutionHandle::new(
        |_cancel_token| async move {
            anyhow::bail!("Critical task failed!");
        },
        parent_token.clone(),
        "test-failure-task",
    )
    .await
    .unwrap();

    // Task should fail and cancel parent token
    let error = handle
        .join()
        .await
        .expect_err("a failing critical task must surface its error through join()");

    // `{:#}` renders the whole anyhow chain (context plus root cause) on one line, so both
    // parts can be checked instead of accepting either one.
    let error_msg = format!("{:#}", error);
    assert!(
        error_msg.contains("Critical task 'test-failure-task' failed"),
        "Error message should contain failure context: {}",
        error_msg
    );
    assert!(
        error_msg.contains("Critical task failed!"),
        "Error message should preserve the original error: {}",
        error_msg
    );

    // No sleep is required: the monitor task cancels the parent token before it forwards
    // the result that join() just returned.
    assert!(parent_token.is_cancelled());
}
#[tokio::test]
async fn test_task_panic_is_caught_and_reported() {
    // Test: A critical task that panics during execution
    // Verifies:
    // - Tokio's JoinHandle catches panics automatically
    // - Panics are converted to proper Error types
    // - System doesn't crash, panic is contained
    // - Error message indicates a panic occurred and carries the panic payload
    // - Parent token is cancelled (panic is treated as critical failure)
    // - Demonstrates panic safety of the critical task system
    let parent_token = CancellationToken::new();

    let handle = CriticalTaskExecutionHandle::new(
        |_cancel_token| async move {
            panic!("Something went terribly wrong!");
        },
        parent_token.clone(),
        "test-panic-task",
    )
    .await
    .unwrap();

    // Panic should be caught and converted to error
    let error_msg = handle
        .join()
        .await
        .expect_err("a panicking critical task must surface an error through join()")
        .to_string();

    assert!(
        error_msg.contains("panicked"),
        "Error message should indicate a panic occurred: {}",
        error_msg
    );
    assert!(
        error_msg.contains("Something went terribly wrong!"),
        "Error message should include the panic payload: {}",
        error_msg
    );

    // Parent token should be cancelled due to panic (critical failure)
    assert!(parent_token.is_cancelled());
}
#[tokio::test]
async fn test_graceful_shutdown_via_cancellation_token() {
    // Scenario: a long-running task polls its token and stops when asked to.
    // Expectations:
    // - the task observes the cancellation and exits early with Ok(())
    // - some, but not all, of the work has been performed
    // - handle.cancel() only affects the child token; the parent stays live
    let parent_token = CancellationToken::new();
    let work_done = Arc::new(AtomicU32::new(0));

    let handle = {
        let progress = Arc::clone(&work_done);
        CriticalTaskExecutionHandle::new(
            move |cancel_token| async move {
                for step in 0..100 {
                    if cancel_token.is_cancelled() {
                        break;
                    }
                    progress.store(step, Ordering::SeqCst);
                    tokio::time::sleep(Duration::from_millis(10)).await;
                }
                Ok(())
            },
            parent_token.clone(),
            "test-graceful-shutdown",
        )
        .await
        .unwrap()
    };

    // Give the task time for a few iterations, then ask it to stop.
    tokio::time::sleep(Duration::from_millis(50)).await;
    handle.cancel();

    // A graceful stop is reported as success.
    assert!(handle.join().await.is_ok());

    // Partial progress only: more than nothing, less than everything.
    let final_work = work_done.load(Ordering::SeqCst);
    assert!(final_work > 0);
    assert!(final_work < 99);

    // Graceful shutdown must not escalate to the parent token.
    assert!(!parent_token.is_cancelled());
}
#[tokio::test]
async fn test_multiple_critical_tasks_one_failure() {
    // Test: Multiple critical tasks sharing a parent token, one fails
    // Verifies:
    // - Multiple critical tasks can share the same parent cancellation token
    // - When one critical task fails, all related tasks receive cancellation signal
    // - Tasks can respond to cancellation and stop gracefully
    // - System-wide shutdown behavior when critical components fail
    // - Demonstrates coordinated shutdown of related services
    let parent_token = CancellationToken::new();
    let task1_completed = Arc::new(AtomicBool::new(false));
    let task2_completed = Arc::new(AtomicBool::new(false));

    let task1_completed_clone = task1_completed.clone();
    let task2_completed_clone = task2_completed.clone();

    // Start two critical tasks
    let handle1 = CriticalTaskExecutionHandle::new(
        |cancel_token| async move {
            for _ in 0..50 {
                if cancel_token.is_cancelled() {
                    return Ok(());
                }
                tokio::time::sleep(Duration::from_millis(10)).await;
            }
            task1_completed_clone.store(true, Ordering::SeqCst);
            Ok(())
        },
        parent_token.clone(),
        "long-running-task",
    )
    .await
    .unwrap();

    let handle2 = CriticalTaskExecutionHandle::new(
        |_cancel_token| async move {
            tokio::time::sleep(Duration::from_millis(100)).await;
            task2_completed_clone.store(true, Ordering::SeqCst);
            anyhow::bail!("Task 2 failed!");
        },
        parent_token.clone(),
        "failing-task",
    )
    .await
    .unwrap();

    // Wait for task 2 to fail
    let result2 = handle2.join().await;
    assert!(result2.is_err());

    // Task 2 ran up to its failure point (the flag is set right before it bails).
    assert!(task2_completed.load(Ordering::SeqCst));

    // Parent token should be cancelled due to task 2 failure
    assert!(parent_token.is_cancelled());

    // Task 1 should complete early due to cancellation
    let result1 = handle1.join().await;
    assert!(result1.is_ok());
    // Should not have completed normally
    assert!(!task1_completed.load(Ordering::SeqCst));
}
#[tokio::test]
async fn test_status_checking_methods() {
    // Scenario: query the handle's non-blocking status accessors around a cancel() call.
    // Expectations:
    // - a freshly started task is neither finished nor cancelled
    // - is_cancelled() flips to true right after cancel()
    // - the task honours the cancellation and join() reports success
    let parent_token = CancellationToken::new();
    let pause = Duration::from_millis(100);

    let handle = CriticalTaskExecutionHandle::new(
        move |cancel_token| async move {
            tokio::time::sleep(pause).await;
            // Only do the second half of the work if nobody asked us to stop.
            if !cancel_token.is_cancelled() {
                tokio::time::sleep(pause).await;
            }
            Ok(())
        },
        parent_token,
        "status-test-task",
    )
    .await
    .unwrap();

    // Right after start: still running, not cancelled.
    assert!(!handle.is_finished());
    assert!(!handle.is_cancelled());

    // Request a graceful stop; the flag is visible immediately even though the task may
    // still be running.
    handle.cancel();
    assert!(handle.is_cancelled());

    // The task winds down cleanly.
    assert!(handle.join().await.is_ok());
}
#[tokio::test]
async fn test_task_with_select_pattern() {
    // Scenario: the task races its work against the cancellation token via tokio::select!.
    // Expectations:
    // - cancellation interrupts the pending work instead of waiting for a check point
    // - the interrupted task still reports Ok(())
    // - the work branch never ran, so its side effect is absent
    let parent_token = CancellationToken::new();
    let work_completed = Arc::new(AtomicBool::new(false));

    let handle = {
        let done_flag = Arc::clone(&work_completed);
        CriticalTaskExecutionHandle::new(
            move |cancel_token| async move {
                tokio::select! {
                    _ = tokio::time::sleep(Duration::from_millis(200)) => {
                        done_flag.store(true, Ordering::SeqCst);
                        Ok(())
                    }
                    _ = cancel_token.cancelled() => {
                        // Graceful shutdown requested
                        Ok(())
                    }
                }
            },
            parent_token,
            "select-pattern-task",
        )
        .await
        .unwrap()
    };

    // Cancel well before the simulated work would finish.
    tokio::time::sleep(Duration::from_millis(50)).await;
    handle.cancel();

    assert!(handle.join().await.is_ok());
    // The work branch must not have completed.
    assert!(!work_completed.load(Ordering::SeqCst));
}
#[tokio::test]
async fn test_timeout_behavior() {
    // Test: External timeout vs task failure distinction
    // Verifies:
    // - External timeouts don't trigger parent token cancellation
    // - Tasks continue running in background even after timeout
    // - Difference between "waiting timeout" and "task failure"
    // - Client-side timeout vs server-side failure handling
    // - Demonstrates that join() timeout != critical task failure
    let parent_token = CancellationToken::new();

    let handle = CriticalTaskExecutionHandle::new(
        |_cancel_token| async move {
            // A task that takes a long time
            tokio::time::sleep(Duration::from_secs(10)).await;
            Ok(())
        },
        // Keep our own copy of the token so its state can be inspected afterwards.
        parent_token.clone(),
        "long-task",
    )
    .await
    .unwrap();

    // Test with timeout
    let result = timeout(Duration::from_millis(100), handle.join()).await;
    assert!(result.is_err(), "join() should time out while the task is still running");

    // The parent token should NOT be cancelled since task didn't fail
    // (it's still running in the background, but we timed out waiting for it)
    assert!(
        !parent_token.is_cancelled(),
        "Timing out on join() must not cancel the parent token"
    );
}
#[tokio::test]
async
fn
test_panic_triggers_immediate_parent_cancellation
()
{
// Test: Verify that panics trigger parent cancellation immediately via monitor task
// Verifies:
// - Monitor task detects panics immediately when they occur
// - Parent token cancellation happens immediately, not on join()
// - System shutdown is triggered as soon as critical task panics
// - Demonstrates true "critical task" behavior with immediate failure propagation
let
parent_token
=
CancellationToken
::
new
();
let
_
handle
=
CriticalTaskExecutionHandle
::
new
(
|
_
cancel_token
|
async
move
{
tokio
::
time
::
sleep
(
Duration
::
from_millis
(
50
))
.await
;
panic!
(
"Critical failure!"
);
},
parent_token
.clone
(),
"immediate-panic-task"
,
)
.await
.unwrap
();
// Wait for the panic to be detected by monitor task
tokio
::
time
::
sleep
(
Duration
::
from_millis
(
100
))
.await
;
// Parent token should be cancelled immediately via monitor task
assert
!
(
parent_token
.is_cancelled
(),
"Parent token should be cancelled immediately when critical task panics"
);
}
#[tokio::test]
async
fn
test_error_triggers_immediate_parent_cancellation
()
{
// Test: Verify that regular errors also trigger parent cancellation immediately
// Verifies:
// - Parent token cancellation happens immediately when task returns error
// - No need to call join() for critical failure detection
// - Both panics AND regular errors trigger immediate system shutdown
// - Demonstrates consistent critical failure behavior
let
parent_token
=
CancellationToken
::
new
();
let
_
handle
=
CriticalTaskExecutionHandle
::
new
(
|
_
cancel_token
|
async
move
{
tokio
::
time
::
sleep
(
Duration
::
from_millis
(
50
))
.await
;
anyhow
::
bail!
(
"Critical error!"
);
},
parent_token
.clone
(),
"immediate-error-task"
,
)
.await
.unwrap
();
// Don't call join() - just wait for the error to be detected
tokio
::
time
::
sleep
(
Duration
::
from_millis
(
100
))
.await
;
// Parent token should be cancelled even though we didn't call join()
assert
!
(
parent_token
.is_cancelled
(),
"Parent token should be cancelled immediately when critical task errors"
);
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment