Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
a09ca3ec
Unverified
Commit
a09ca3ec
authored
Jun 13, 2025
by
Jacky
Committed by
GitHub
Jun 13, 2025
Browse files
feat: FT downed worker instance tracking and skipping (#1424)
parent
648740e8
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
142 additions
and
53 deletions
+142
-53
lib/runtime/src/component/client.rs
lib/runtime/src/component/client.rs
+55
-1
lib/runtime/src/pipeline/network/egress/push_router.rs
lib/runtime/src/pipeline/network/egress/push_router.rs
+87
-52
No files found.
lib/runtime/src/component/client.rs
View file @
a09ca3ec
...
...
@@ -55,8 +55,10 @@ enum EndpointEvent {
pub
struct
Client
{
// This is me
pub
endpoint
:
Endpoint
,
// These are the remotes I know about
// These are the remotes I know about
from watching etcd
pub
instance_source
:
Arc
<
InstanceSource
>
,
// These are the instances that are reported as down from sending rpc
instance_inhibited
:
Arc
<
Mutex
<
HashMap
<
i64
,
u64
>>>
,
}
#[derive(Clone,
Debug)]
...
...
@@ -65,12 +67,15 @@ pub enum InstanceSource {
Dynamic
(
tokio
::
sync
::
watch
::
Receiver
<
Vec
<
Instance
>>
),
}
// TODO: Avoid returning a full clone of `Vec<Instance>` everytime from Client
// See instances() and instances_avail() methods
impl
Client
{
// Client will only talk to a single static endpoint
pub
(
crate
)
async
fn
new_static
(
endpoint
:
Endpoint
)
->
Result
<
Self
>
{
Ok
(
Client
{
endpoint
,
instance_source
:
Arc
::
new
(
InstanceSource
::
Static
),
instance_inhibited
:
Arc
::
new
(
Mutex
::
new
(
HashMap
::
new
())),
})
}
...
...
@@ -87,6 +92,7 @@ impl Client {
Ok
(
Client
{
endpoint
,
instance_source
,
instance_inhibited
:
Arc
::
new
(
Mutex
::
new
(
HashMap
::
new
())),
})
}
...
...
@@ -99,6 +105,7 @@ impl Client {
self
.endpoint
.etcd_root
()
}
/// Instances available from watching etcd
pub
fn
instances
(
&
self
)
->
Vec
<
Instance
>
{
match
self
.instance_source
.as_ref
()
{
InstanceSource
::
Static
=>
vec!
[],
...
...
@@ -127,6 +134,53 @@ impl Client {
Ok
(
instances
)
}
/// Instances available from watching etcd minus those reported as down
pub
async
fn
instances_avail
(
&
self
)
->
Vec
<
Instance
>
{
// TODO: Can we get the remaining TTL from the lease for the instance?
const
ETCD_LEASE_TTL
:
u64
=
10
;
// seconds
let
now
=
std
::
time
::
SystemTime
::
now
()
.duration_since
(
std
::
time
::
UNIX_EPOCH
)
.unwrap
()
.as_secs
();
let
instances
=
self
.instances
();
let
mut
inhibited
=
self
.instance_inhibited
.lock
()
.await
;
instances
.into_iter
()
.filter_map
(|
instance
|
{
let
id
=
instance
.id
();
if
let
Some
(
&
timestamp
)
=
inhibited
.get
(
&
id
)
{
// If the inhibition is stale, remove it and include the instance
if
now
.saturating_sub
(
timestamp
)
>
ETCD_LEASE_TTL
{
tracing
::
debug!
(
"instance {id} stale inhibition"
);
inhibited
.remove
(
&
id
);
Some
(
instance
)
}
else
{
tracing
::
debug!
(
"instance {id} is inhibited"
);
None
}
}
else
{
tracing
::
debug!
(
"instance {id} not inhibited"
);
Some
(
instance
)
}
})
.collect
()
}
/// Mark an instance as down/unavailable
pub
async
fn
report_instance_down
(
&
self
,
instance_id
:
i64
)
{
let
now
=
std
::
time
::
SystemTime
::
now
()
.duration_since
(
std
::
time
::
UNIX_EPOCH
)
.unwrap
()
.as_secs
();
let
mut
inhibited
=
self
.instance_inhibited
.lock
()
.await
;
inhibited
.insert
(
instance_id
,
now
);
tracing
::
debug!
(
"inhibiting instance {instance_id}"
);
}
/// Is this component know at startup and not discovered via etcd?
pub
fn
is_static
(
&
self
)
->
bool
{
matches!
(
self
.instance_source
.as_ref
(),
InstanceSource
::
Static
)
...
...
lib/runtime/src/pipeline/network/egress/push_router.rs
View file @
a09ca3ec
...
...
@@ -13,10 +13,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use
async_nats
::
client
::{
RequestError
as
NatsRequestError
,
RequestErrorKind
::
NoResponders
as
NatsNoResponders
,
};
use
async_trait
::
async_trait
;
use
rand
::
Rng
;
use
serde
::{
Deserialize
,
Serialize
};
use
std
::{
future
::
Future
,
marker
::
PhantomData
,
sync
::{
atomic
::{
AtomicU64
,
Ordering
},
...
...
@@ -27,7 +31,9 @@ use std::{
use
crate
::{
component
::{
Client
,
Endpoint
,
InstanceSource
},
engine
::{
AsyncEngine
,
Data
},
pipeline
::{
AddressedPushRouter
,
AddressedRequest
,
Error
,
ManyOut
,
SingleIn
},
pipeline
::{
error
::
PipelineErrorExt
,
AddressedPushRouter
,
AddressedRequest
,
Error
,
ManyOut
,
SingleIn
,
},
traits
::
DistributedRuntimeProvider
,
};
...
...
@@ -103,49 +109,52 @@ where
/// Issue a request to the next available instance in a round-robin fashion
pub
async
fn
round_robin
(
&
self
,
request
:
SingleIn
<
T
>
)
->
anyhow
::
Result
<
ManyOut
<
U
>>
{
let
counter
=
self
.round_robin_counter
.fetch_add
(
1
,
Ordering
::
Relaxed
);
let
slf
=
self
;
let
routing_algorithm
=
move
||
async
move
{
let
counter
=
slf
.round_robin_counter
.fetch_add
(
1
,
Ordering
::
Relaxed
);
let
instance_id
=
{
let
instances
=
slf
.client
.instances_avail
()
.await
;
let
count
=
instances
.len
();
if
count
==
0
{
return
Err
(
anyhow
::
anyhow!
(
"no instances found for endpoint {:?}"
,
slf
.client.endpoint
.etcd_root
()
));
}
let
offset
=
counter
%
count
as
u64
;
instances
[
offset
as
usize
]
.id
()
};
tracing
::
trace!
(
"round robin router selected {instance_id}"
);
let
instance_id
=
{
let
instances
=
self
.client
.instances
();
let
count
=
instances
.len
();
if
count
==
0
{
return
Err
(
anyhow
::
anyhow!
(
"no instances found for endpoint {:?}"
,
self
.client.endpoint
.etcd_root
()
));
}
let
offset
=
counter
%
count
as
u64
;
instances
[
offset
as
usize
]
.id
()
Ok
(
instance_id
)
};
tracing
::
trace!
(
"round robin router selected {instance_id}"
);
let
subject
=
self
.client.endpoint
.subject_to
(
instance_id
);
let
request
=
request
.map
(|
req
|
AddressedRequest
::
new
(
req
,
subject
));
self
.addressed
.generate
(
request
)
.await
self
.generate_with_fault_tolerance
(
routing_algorithm
,
request
)
.await
}
/// Issue a request to a random endpoint
pub
async
fn
random
(
&
self
,
request
:
SingleIn
<
T
>
)
->
anyhow
::
Result
<
ManyOut
<
U
>>
{
let
instance_id
=
{
let
instances
=
self
.client
.instances
();
let
count
=
instances
.len
();
if
count
==
0
{
return
Err
(
anyhow
::
anyhow!
(
"no instances found for endpoint {:?}"
,
self
.client.endpoint
.etcd_root
()
));
}
let
counter
=
rand
::
rng
()
.random
::
<
u64
>
();
let
offset
=
counter
%
count
as
u64
;
instances
[
offset
as
usize
]
.id
()
let
slf
=
self
;
let
routing_algorithm
=
move
||
async
move
{
let
instance_id
=
{
let
instances
=
slf
.client
.instances_avail
()
.await
;
let
count
=
instances
.len
();
if
count
==
0
{
return
Err
(
anyhow
::
anyhow!
(
"no instances found for endpoint {:?}"
,
slf
.client.endpoint
.etcd_root
()
));
}
let
counter
=
rand
::
rng
()
.random
::
<
u64
>
();
let
offset
=
counter
%
count
as
u64
;
instances
[
offset
as
usize
]
.id
()
};
tracing
::
trace!
(
"random router selected {instance_id}"
);
Ok
(
instance_id
)
};
tracing
::
trace!
(
"random router selected {instance_id}"
);
let
subject
=
self
.client.endpoint
.subject_to
(
instance_id
);
let
request
=
request
.map
(|
req
|
AddressedRequest
::
new
(
req
,
subject
));
self
.addressed
.generate
(
request
)
.await
self
.generate_with_fault_tolerance
(
routing_algorithm
,
request
)
.await
}
/// Issue a request to a specific endpoint
...
...
@@ -154,22 +163,23 @@ where
request
:
SingleIn
<
T
>
,
instance_id
:
i64
,
)
->
anyhow
::
Result
<
ManyOut
<
U
>>
{
let
found
=
{
let
instances
=
self
.client
.instances
();
instances
.iter
()
.any
(|
ep
|
ep
.id
()
==
instance_id
)
let
slf
=
self
;
let
routing_algorithm
=
move
||
async
move
{
let
found
=
{
let
instances
=
slf
.client
.instances_avail
()
.await
;
instances
.iter
()
.any
(|
ep
|
ep
.id
()
==
instance_id
)
};
if
!
found
{
return
Err
(
anyhow
::
anyhow!
(
"instance_id={instance_id} not found for endpoint {:?}"
,
slf
.client.endpoint
.etcd_root
()
));
}
Ok
(
instance_id
)
};
if
!
found
{
return
Err
(
anyhow
::
anyhow!
(
"instance_id={instance_id} not found for endpoint {:?}"
,
self
.client.endpoint
.etcd_root
()
));
}
let
subject
=
self
.client.endpoint
.subject_to
(
instance_id
);
let
request
=
request
.map
(|
req
|
AddressedRequest
::
new
(
req
,
subject
));
self
.addressed
.generate
(
request
)
.await
self
.generate_with_fault_tolerance
(
routing_algorithm
,
request
)
.await
}
pub
async
fn
r
#
static
(
&
self
,
request
:
SingleIn
<
T
>
)
->
anyhow
::
Result
<
ManyOut
<
U
>>
{
...
...
@@ -179,6 +189,31 @@ where
tracing
::
debug!
(
"router generate"
);
self
.addressed
.generate
(
request
)
.await
}
async
fn
generate_with_fault_tolerance
<
F
,
R
>
(
&
self
,
routing_algorithm
:
F
,
request
:
SingleIn
<
T
>
,
)
->
anyhow
::
Result
<
ManyOut
<
U
>>
where
F
:
FnOnce
()
->
R
,
R
:
Future
<
Output
=
anyhow
::
Result
<
i64
>>
,
{
let
instance_id
=
routing_algorithm
()
.await
?
;
let
subject
=
self
.client.endpoint
.subject_to
(
instance_id
);
let
request
=
request
.map
(|
req
|
AddressedRequest
::
new
(
req
,
subject
));
let
stream
=
self
.addressed
.generate
(
request
)
.await
;
if
let
Some
(
err
)
=
stream
.as_ref
()
.err
()
{
if
let
Some
(
req_err
)
=
err
.downcast_ref
::
<
NatsRequestError
>
()
{
if
matches!
(
req_err
.kind
(),
NatsNoResponders
)
{
self
.client
.report_instance_down
(
instance_id
)
.await
;
}
}
}
stream
}
}
#[async_trait]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment