Unverified commit 7389a369, authored by Yan Ru Pei and committed by GitHub
Browse files

chore(kv-router): share recovery cursor state (#7596)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
parent a818a4bd
......@@ -8,6 +8,7 @@
pub mod indexer;
pub mod protocols;
pub mod recovery;
pub mod scheduling;
pub mod sequences;
pub mod zmq_wire;
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
/// Shared cursor state for monotonically increasing event streams.
///
/// `InvalidatedByBarrier` represents a semantic stream boundary such as a
/// worker-wide `Cleared` event. After such a barrier, callers must not attempt
/// to recover pre-barrier gaps.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum CursorState {
/// No event id has been applied yet. This is the `Default` state.
#[default]
Initial,
/// The cursor is live; the payload is the id of the last applied event.
Live(u64),
/// A barrier invalidated the cursor; the payload is the id that had been
/// applied before the barrier, or `None` if nothing had been applied.
InvalidatedByBarrier(Option<u64>),
}
/// Classification of an incoming event id relative to a [`CursorState`],
/// as returned by [`CursorState::observe`].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum CursorObservation {
/// The cursor was [`CursorState::Initial`]; `got` is the observed id.
Initial {
got: u64,
},
/// `got` is exactly one past the last applied id of a live cursor.
Contiguous {
got: u64,
},
/// `got` is more than one past the last applied id of a live cursor;
/// `expected` is the last applied id plus one.
Gap {
expected: u64,
got: u64,
},
/// `got` is less than or equal to `last_applied`.
Stale {
got: u64,
last_applied: Option<u64>,
},
/// The cursor was invalidated by a barrier and `got` is newer than
/// `last_before_barrier` (or no pre-barrier id was recorded).
FreshAfterBarrier {
got: u64,
last_before_barrier: Option<u64>,
},
}
impl CursorState {
    /// Returns the id of the last applied event, if one is recorded.
    ///
    /// `Initial` yields `None`, `Live` yields its id, and
    /// `InvalidatedByBarrier` yields the id captured before the barrier
    /// (which may itself be `None`).
    #[must_use]
    pub fn last_applied_id(self) -> Option<u64> {
        match self {
            Self::Live(applied) => Some(applied),
            Self::InvalidatedByBarrier(before_barrier) => before_barrier,
            Self::Initial => None,
        }
    }

    /// Classifies the incoming id `got` against this cursor.
    ///
    /// This is a pure query: the cursor is taken by value (it is `Copy`) and
    /// a [`CursorObservation`] is returned; no state is updated.
    #[must_use]
    pub fn observe(self, got: u64) -> CursorObservation {
        match self {
            Self::Initial => CursorObservation::Initial { got },
            Self::Live(applied) => {
                if got <= applied {
                    CursorObservation::Stale {
                        got,
                        last_applied: Some(applied),
                    }
                } else if got - applied == 1 {
                    CursorObservation::Contiguous { got }
                } else {
                    // `got > applied` on this branch, so `applied + 1` cannot overflow.
                    CursorObservation::Gap {
                        expected: applied + 1,
                        got,
                    }
                }
            }
            Self::InvalidatedByBarrier(before_barrier) => {
                // Only ids at or below a recorded pre-barrier id count as stale;
                // with no recorded id every incoming id is fresh.
                let already_applied = matches!(before_barrier, Some(last) if got <= last);
                if already_applied {
                    CursorObservation::Stale {
                        got,
                        last_applied: before_barrier,
                    }
                } else {
                    CursorObservation::FreshAfterBarrier {
                        got,
                        last_before_barrier: before_barrier,
                    }
                }
            }
        }
    }

    /// Returns a live cursor positioned at `id`.
    ///
    /// The previous state is deliberately ignored.
    #[must_use]
    pub fn advance_to(self, id: u64) -> Self {
        let _ = self;
        Self::Live(id)
    }

    /// Returns an invalidated cursor that remembers this cursor's last applied id.
    #[must_use]
    pub fn invalidate_by_barrier(self) -> Self {
        let before_barrier = self.last_applied_id();
        Self::InvalidatedByBarrier(before_barrier)
    }

    /// Returns a live cursor positioned at the barrier event id `clear_id`.
    ///
    /// The previous state is deliberately ignored.
    #[must_use]
    pub fn apply_barrier(self, clear_id: u64) -> Self {
        let _ = self;
        Self::Live(clear_id)
    }
}
#[cfg(test)]
mod tests {
    //! Unit tests for the cursor state machine: observation classification
    //! and the state transitions, including boundary ids and barriers with
    //! no previously applied id.

    use super::{CursorObservation, CursorState};

    #[test]
    fn default_state_is_initial() {
        assert_eq!(CursorState::default(), CursorState::Initial);
    }

    #[test]
    fn last_applied_id_reflects_each_state() {
        assert_eq!(CursorState::Initial.last_applied_id(), None);
        assert_eq!(CursorState::Live(3).last_applied_id(), Some(3));
        assert_eq!(
            CursorState::InvalidatedByBarrier(Some(4)).last_applied_id(),
            Some(4)
        );
        assert_eq!(
            CursorState::InvalidatedByBarrier(None).last_applied_id(),
            None
        );
    }

    #[test]
    fn initial_observation_preserves_first_id() {
        assert_eq!(
            CursorState::Initial.observe(0),
            CursorObservation::Initial { got: 0 }
        );
        assert_eq!(
            CursorState::Initial.observe(5),
            CursorObservation::Initial { got: 5 }
        );
    }

    #[test]
    fn live_observation_detects_contiguous_gap_and_stale_ids() {
        assert_eq!(
            CursorState::Live(10).observe(11),
            CursorObservation::Contiguous { got: 11 }
        );
        assert_eq!(
            CursorState::Live(10).observe(15),
            CursorObservation::Gap {
                expected: 11,
                got: 15,
            }
        );
        assert_eq!(
            CursorState::Live(10).observe(10),
            CursorObservation::Stale {
                got: 10,
                last_applied: Some(10),
            }
        );
        assert_eq!(
            CursorState::Live(10).observe(9),
            CursorObservation::Stale {
                got: 9,
                last_applied: Some(10),
            }
        );
    }

    #[test]
    fn live_observation_handles_ids_at_u64_max() {
        // Every id is stale once the cursor sits at the maximum id.
        assert_eq!(
            CursorState::Live(u64::MAX).observe(u64::MAX),
            CursorObservation::Stale {
                got: u64::MAX,
                last_applied: Some(u64::MAX),
            }
        );
        assert_eq!(
            CursorState::Live(u64::MAX - 1).observe(u64::MAX),
            CursorObservation::Contiguous { got: u64::MAX }
        );
        assert_eq!(
            CursorState::Live(0).observe(u64::MAX),
            CursorObservation::Gap {
                expected: 1,
                got: u64::MAX,
            }
        );
    }

    #[test]
    fn barrier_invalidation_preserves_last_applied_id() {
        assert_eq!(
            CursorState::Live(17).invalidate_by_barrier(),
            CursorState::InvalidatedByBarrier(Some(17))
        );
        assert_eq!(
            CursorState::InvalidatedByBarrier(Some(17)).observe(16),
            CursorObservation::Stale {
                got: 16,
                last_applied: Some(17),
            }
        );
        assert_eq!(
            CursorState::InvalidatedByBarrier(Some(17)).observe(20),
            CursorObservation::FreshAfterBarrier {
                got: 20,
                last_before_barrier: Some(17),
            }
        );
    }

    #[test]
    fn barrier_boundary_id_is_stale_and_next_id_is_fresh() {
        assert_eq!(
            CursorState::InvalidatedByBarrier(Some(17)).observe(17),
            CursorObservation::Stale {
                got: 17,
                last_applied: Some(17),
            }
        );
        assert_eq!(
            CursorState::InvalidatedByBarrier(Some(17)).observe(18),
            CursorObservation::FreshAfterBarrier {
                got: 18,
                last_before_barrier: Some(17),
            }
        );
    }

    #[test]
    fn barrier_without_prior_id_treats_every_id_as_fresh() {
        assert_eq!(
            CursorState::Initial.invalidate_by_barrier(),
            CursorState::InvalidatedByBarrier(None)
        );
        assert_eq!(
            CursorState::InvalidatedByBarrier(None).observe(0),
            CursorObservation::FreshAfterBarrier {
                got: 0,
                last_before_barrier: None,
            }
        );
    }

    #[test]
    fn repeated_invalidation_keeps_pre_barrier_id() {
        assert_eq!(
            CursorState::InvalidatedByBarrier(Some(5)).invalidate_by_barrier(),
            CursorState::InvalidatedByBarrier(Some(5))
        );
    }

    #[test]
    fn apply_barrier_and_advance_restore_live_cursor() {
        assert_eq!(
            CursorState::Initial.apply_barrier(20),
            CursorState::Live(20)
        );
        assert_eq!(CursorState::Initial.advance_to(7), CursorState::Live(7));
    }

    #[test]
    fn apply_barrier_and_advance_overwrite_existing_state() {
        assert_eq!(
            CursorState::InvalidatedByBarrier(Some(9)).apply_barrier(20),
            CursorState::Live(20)
        );
        assert_eq!(CursorState::Live(10).advance_to(11), CursorState::Live(11));
        assert_eq!(
            CursorState::InvalidatedByBarrier(None).advance_to(1),
            CursorState::Live(1)
        );
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
mod cursor;
pub use cursor::{CursorObservation, CursorState};
......@@ -298,6 +298,7 @@ impl ListenerRecord {
}
}
/// Locks `self.runtime` and returns its `status` field.
// NOTE(review): the reason this accessor is allowed to be dead code is not
// visible here; presumably it is only used by tests or some build
// configurations. Confirm and record the reason.
#[allow(dead_code)]
fn status(&self) -> ListenerStatus {
self.runtime.lock().status
}
......
......@@ -25,6 +25,7 @@ use crate::kv_router::worker_kv_indexer_query_endpoint;
use dynamo_kv_router::{
indexer::{LocalKvIndexer, WorkerKvQueryRequest, WorkerKvQueryResponse},
protocols::{DpRank, KvCacheEventData, RouterEvent, WorkerId},
recovery::{CursorObservation, CursorState},
};
// Recovery retry configuration
......@@ -37,28 +38,16 @@ const QUERY_ENDPOINT_PREFIX: &str = "worker_kv_indexer_query_dp";
type RecoveryKey = (WorkerId, DpRank);
#[derive(Clone, Copy, Debug, Default)]
enum RankCursor {
#[default]
NeedsRestore,
Live(u64),
InvalidatedByBarrier(Option<u64>),
}
#[derive(Debug, Default)]
struct RankState {
cursor: RankCursor,
cursor: CursorState,
max_seen_live_id: Option<u64>,
recovery_inflight: bool,
}
impl RankState {
fn last_applied_id(&self) -> Option<u64> {
match self.cursor {
RankCursor::NeedsRestore => None,
RankCursor::Live(event_id) => Some(event_id),
RankCursor::InvalidatedByBarrier(last_applied_id) => last_applied_id,
}
self.cursor.last_applied_id()
}
fn observe_live_id(&mut self, event_id: u64) {
......@@ -301,9 +290,7 @@ impl WorkerQueryClient {
let spawn = {
let mut worker_state = worker_state.lock().await;
let rank_state = worker_state.ranks.entry(dp_rank).or_default();
if matches!(rank_state.cursor, RankCursor::NeedsRestore)
&& !rank_state.recovery_inflight
{
if matches!(rank_state.cursor, CursorState::Initial) && !rank_state.recovery_inflight {
tracing::info!(
"WorkerQueryClient: discovered worker {worker_id} dp_rank {dp_rank}, scheduling restore"
);
......@@ -350,13 +337,13 @@ impl WorkerQueryClient {
worker_state.epoch += 1;
for rank_state in worker_state.ranks.values_mut() {
rank_state.cursor = RankCursor::InvalidatedByBarrier(rank_state.last_applied_id());
rank_state.cursor = rank_state.cursor.invalidate_by_barrier();
rank_state.max_seen_live_id = None;
rank_state.recovery_inflight = false;
}
let rank_state = worker_state.ranks.entry(clear_dp_rank).or_default();
rank_state.cursor = RankCursor::Live(clear_event_id);
rank_state.cursor = rank_state.cursor.apply_barrier(clear_event_id);
tracing::info!(
"Applying clear barrier for worker {worker_id}; invalidating recovery across {} dp_ranks",
......@@ -394,63 +381,43 @@ impl WorkerQueryClient {
}
// Already applied the event, so no further action needed.
return;
} else {
match rank_state.cursor {
// We have never established a cursor for this rank, so live traffic only tells
// us how far ahead the stream has moved while a full restore catches up.
RankCursor::NeedsRestore => {
rank_state.observe_live_id(event_id);
if !rank_state.recovery_inflight {
rank_state.recovery_inflight = true;
Action::SpawnFullRestore {
epoch: worker_state.epoch,
}
} else {
// A recovery is already in flight. Nothing to do.
return;
}
}
// Normal steady-state path: apply contiguous events directly, but coalesce any
// gap into a single recovery pass using `max_seen_live_id` as the high-water mark.
RankCursor::Live(last_applied_id) => {
if event_id <= last_applied_id {
// We've already applied this event. Nothing to do.
return;
} else if rank_state.recovery_inflight {
// A recovery is already in flight. Drop the event for now, and potentially spawn a new recovery afterwards.
rank_state.observe_live_id(event_id);
return;
} else if event_id > last_applied_id.saturating_add(1) {
// We've detected a gap. Spawn a new recovery pass.
}
match rank_state.cursor.observe(event_id) {
CursorObservation::Stale { .. } => return,
observation if rank_state.recovery_inflight => {
match observation {
CursorObservation::Initial { .. }
| CursorObservation::Contiguous { .. }
| CursorObservation::Gap { .. }
| CursorObservation::FreshAfterBarrier { .. } => {
rank_state.observe_live_id(event_id);
rank_state.recovery_inflight = true;
Action::SpawnIncremental {
epoch: worker_state.epoch,
start_event_id: last_applied_id.saturating_add(1),
}
} else {
// Apply the event.
rank_state.cursor = RankCursor::Live(event_id);
rank_state.clear_max_seen_if_caught_up(event_id);
Action::ApplyDirect
}
CursorObservation::Stale { .. } => {}
}
// A worker-wide barrier (currently `Cleared`) invalidated this rank's old
// cursor. The next newer live event becomes the new starting point; we do not
// recover across the barrier.
RankCursor::InvalidatedByBarrier(last_applied_id) => {
if last_applied_id
.is_some_and(|last_applied_id| event_id <= last_applied_id)
{
return;
} else {
rank_state.cursor = RankCursor::Live(event_id);
rank_state.max_seen_live_id = None;
rank_state.recovery_inflight = false;
Action::ApplyDirect
}
return;
}
CursorObservation::Initial { .. } => {
rank_state.observe_live_id(event_id);
rank_state.recovery_inflight = true;
Action::SpawnFullRestore {
epoch: worker_state.epoch,
}
}
CursorObservation::Gap { expected, .. } => {
rank_state.observe_live_id(event_id);
rank_state.recovery_inflight = true;
Action::SpawnIncremental {
epoch: worker_state.epoch,
start_event_id: expected,
}
}
CursorObservation::Contiguous { got }
| CursorObservation::FreshAfterBarrier { got, .. } => {
rank_state.cursor = rank_state.cursor.advance_to(got);
rank_state.clear_max_seen_if_caught_up(got);
Action::ApplyDirect
}
}
};
......@@ -531,12 +498,12 @@ impl WorkerQueryClient {
if matches!(&event.event.data, KvCacheEventData::Cleared) {
self.apply_worker_clear_locked(&mut worker_state, event)
.await;
new_cursor = RankCursor::Live(event_id);
new_cursor = new_cursor.apply_barrier(event_id);
saw_clear = true;
continue;
}
self.indexer.apply_event(event).await;
new_cursor = RankCursor::Live(event_id);
new_cursor = new_cursor.advance_to(event_id);
}
successful_response = true;
}
......@@ -554,16 +521,14 @@ impl WorkerQueryClient {
for event in &events {
self.indexer.apply_event(event.clone()).await;
}
new_cursor = RankCursor::Live(last_event_id);
new_cursor = new_cursor.advance_to(last_event_id);
successful_response = true;
}
Ok(WorkerKvQueryResponse::TooNew {
requested_start,
requested_end,
newest_available,
newest_available, ..
}) => {
tracing::warn!(
"Requested range [{requested_start:?}, {requested_end:?}] is newer than available (newest: {newest_available}) for worker {} dp_rank {}",
"Requested recovery is newer than available (newest: {newest_available}) for worker {} dp_rank {}",
key.0,
key.1
);
......@@ -803,6 +768,10 @@ mod tests {
/// Returns how many calls have been recorded in `self.calls`.
///
/// Panics if acquiring the lock on `self.calls` returns an error.
fn call_count(&self) -> usize {
    let recorded = self.calls.lock().unwrap();
    recorded.len()
}
/// Returns a cloned snapshot of every call recorded in `self.calls`.
///
/// Panics if acquiring the lock on `self.calls` returns an error.
fn calls(&self) -> Vec<(RecoveryKey, Option<u64>, Option<u64>)> {
    let recorded = self.calls.lock().unwrap();
    recorded.clone()
}
}
#[async_trait]
......@@ -1043,7 +1012,7 @@ mod tests {
{
let worker_state = client.get_or_create_worker_state(key.0);
let mut worker_state = worker_state.lock().await;
worker_state.ranks.entry(key.1).or_default().cursor = RankCursor::Live(10);
worker_state.ranks.entry(key.1).or_default().cursor = CursorState::Live(10);
}
let first_started = Arc::new(Notify::new());
......@@ -1082,6 +1051,10 @@ mod tests {
})
})
.await;
assert_eq!(
transport.calls(),
vec![(key, Some(11), None), (key, Some(16), None)]
);
kv_indexer.flush().await;
let events = kv_indexer.dump_events().await.unwrap();
......@@ -1159,13 +1132,13 @@ mod tests {
{
let worker_state = client.get_or_create_worker_state(delayed_key.0);
let mut worker_state = worker_state.lock().await;
worker_state.ranks.entry(delayed_key.1).or_default().cursor = RankCursor::Live(10);
worker_state.ranks.entry(delayed_key.1).or_default().cursor = CursorState::Live(10);
}
let other_key = (2, 0);
{
let worker_state = client.get_or_create_worker_state(other_key.0);
let mut worker_state = worker_state.lock().await;
worker_state.ranks.entry(other_key.1).or_default().cursor = RankCursor::Live(20);
worker_state.ranks.entry(other_key.1).or_default().cursor = CursorState::Live(20);
}
let started = Arc::new(Notify::new());
......@@ -1210,7 +1183,7 @@ mod tests {
{
let worker_state = client.get_or_create_worker_state(key.0);
let mut worker_state = worker_state.lock().await;
worker_state.ranks.entry(key.1).or_default().cursor = RankCursor::Live(10);
worker_state.ranks.entry(key.1).or_default().cursor = CursorState::Live(10);
}
let started = Arc::new(Notify::new());
......@@ -1247,8 +1220,8 @@ mod tests {
{
let worker_state = client.get_or_create_worker_state(1);
let mut worker_state = worker_state.lock().await;
worker_state.ranks.entry(0).or_default().cursor = RankCursor::Live(10);
worker_state.ranks.entry(1).or_default().cursor = RankCursor::Live(20);
worker_state.ranks.entry(0).or_default().cursor = CursorState::Live(10);
worker_state.ranks.entry(1).or_default().cursor = CursorState::Live(20);
}
let started = Arc::new(Notify::new());
......@@ -1280,6 +1253,9 @@ mod tests {
})
})
.await;
assert!(rank_state_matches(&client, key1, |state| {
matches!(state.cursor, CursorState::InvalidatedByBarrier(Some(20)))
}));
client.handle_live_event(make_store_event(1, 0, 15)).await;
client.handle_live_event(make_store_event(1, 1, 30)).await;
......@@ -1300,8 +1276,8 @@ mod tests {
{
let worker_state = client.get_or_create_worker_state(1);
let mut worker_state = worker_state.lock().await;
worker_state.ranks.entry(0).or_default().cursor = RankCursor::Live(10);
worker_state.ranks.entry(1).or_default().cursor = RankCursor::Live(20);
worker_state.ranks.entry(0).or_default().cursor = CursorState::Live(10);
worker_state.ranks.entry(1).or_default().cursor = CursorState::Live(20);
}
transport.push_action(
......@@ -1327,6 +1303,9 @@ mod tests {
})
})
.await;
assert!(rank_state_matches(&client, key1, |state| {
matches!(state.cursor, CursorState::InvalidatedByBarrier(Some(20)))
}));
assert_eq!(transport.call_count(), 1);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment