Commit 7389a369 (unverified), authored by Yan Ru Pei and committed by GitHub
Browse files

chore(kv-router): share recovery cursor state (#7596)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
parent a818a4bd
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
pub mod indexer; pub mod indexer;
pub mod protocols; pub mod protocols;
pub mod recovery;
pub mod scheduling; pub mod scheduling;
pub mod sequences; pub mod sequences;
pub mod zmq_wire; pub mod zmq_wire;
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
/// Shared cursor state for monotonically increasing event streams.
///
/// The cursor tracks which event id (a `u64`) was applied most recently and
/// lets callers classify the next incoming id via [`CursorState::observe`].
///
/// `InvalidatedByBarrier` represents a semantic stream boundary such as a
/// worker-wide `Cleared` event. After such a barrier, callers must not attempt
/// to recover pre-barrier gaps.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum CursorState {
    /// No event id has been applied yet; this is the default state.
    #[default]
    Initial,
    /// Holds the id of the most recently applied event.
    Live(u64),
    /// A barrier invalidated the cursor. Holds the last id applied before the
    /// barrier, or `None` when nothing had been applied.
    InvalidatedByBarrier(Option<u64>),
}

/// Classification of an incoming event id relative to a [`CursorState`], as
/// returned by [`CursorState::observe`].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum CursorObservation {
    /// The cursor was [`CursorState::Initial`]; `got` is the observed id.
    Initial { got: u64 },
    /// `got` is exactly one past the last applied id of a live cursor.
    Contiguous { got: u64 },
    /// `got` is more than one past the last applied id of a live cursor;
    /// `expected` is the id that would have been contiguous.
    Gap { expected: u64, got: u64 },
    /// `got` is not newer than `last_applied`.
    Stale { got: u64, last_applied: Option<u64> },
    /// The cursor was invalidated by a barrier and `got` is newer than
    /// `last_before_barrier` (or nothing had been applied before the barrier).
    FreshAfterBarrier {
        got: u64,
        last_before_barrier: Option<u64>,
    },
}

impl CursorState {
    /// Returns the last applied event id carried by this state, if any.
    ///
    /// `Initial` carries none; `InvalidatedByBarrier` reports the id recorded
    /// before the barrier.
    #[must_use]
    pub fn last_applied_id(self) -> Option<u64> {
        match self {
            Self::Live(id) => Some(id),
            Self::InvalidatedByBarrier(before_barrier) => before_barrier,
            Self::Initial => None,
        }
    }

    /// Classifies the incoming event id `got` against this cursor.
    ///
    /// This is a pure query: the cursor itself is not modified. Callers decide
    /// how to react and then move the cursor with [`CursorState::advance_to`],
    /// [`CursorState::invalidate_by_barrier`] or [`CursorState::apply_barrier`].
    #[must_use]
    pub fn observe(self, got: u64) -> CursorObservation {
        match self {
            Self::Initial => CursorObservation::Initial { got },
            Self::Live(last) => {
                if got <= last {
                    CursorObservation::Stale {
                        got,
                        last_applied: Some(last),
                    }
                } else if got - last == 1 {
                    CursorObservation::Contiguous { got }
                } else {
                    // `got > last` on this path, so `last < u64::MAX` and the
                    // increment below cannot overflow.
                    CursorObservation::Gap {
                        expected: last + 1,
                        got,
                    }
                }
            }
            Self::InvalidatedByBarrier(before_barrier) => match before_barrier {
                Some(last) if got <= last => CursorObservation::Stale {
                    got,
                    last_applied: before_barrier,
                },
                _ => CursorObservation::FreshAfterBarrier {
                    got,
                    last_before_barrier: before_barrier,
                },
            },
        }
    }

    /// Returns a live cursor positioned at `id`.
    ///
    /// The previous state does not influence the result.
    #[must_use]
    pub fn advance_to(self, id: u64) -> Self {
        // The receiver is consumed only for call-site symmetry with the other
        // transitions; its value is deliberately ignored.
        let _ = self;
        Self::Live(id)
    }

    /// Returns the invalidated form of this cursor, remembering the last
    /// applied id (if any) so that older ids can still be reported as stale.
    #[must_use]
    pub fn invalidate_by_barrier(self) -> Self {
        let last_before_barrier = self.last_applied_id();
        Self::InvalidatedByBarrier(last_before_barrier)
    }

    /// Returns a live cursor positioned at the barrier event id `clear_id`.
    ///
    /// The previous state does not influence the result.
    #[must_use]
    pub fn apply_barrier(self, clear_id: u64) -> Self {
        // As in `advance_to`, the prior state is intentionally discarded.
        let _ = self;
        Self::Live(clear_id)
    }
}
#[cfg(test)]
mod tests {
    //! Unit tests for the cursor state machine: observation classification
    //! and the state transitions, including boundary ids.

    use super::{CursorObservation, CursorState};

    /// An `Initial` cursor reports whatever id it sees first, including 0.
    #[test]
    fn initial_observation_preserves_first_id() {
        assert_eq!(
            CursorState::Initial.observe(0),
            CursorObservation::Initial { got: 0 }
        );
        assert_eq!(
            CursorState::Initial.observe(5),
            CursorObservation::Initial { got: 5 }
        );
    }

    /// A live cursor distinguishes next-id, skipped-ahead and already-seen ids.
    #[test]
    fn live_observation_detects_contiguous_gap_and_stale_ids() {
        assert_eq!(
            CursorState::Live(10).observe(11),
            CursorObservation::Contiguous { got: 11 }
        );
        assert_eq!(
            CursorState::Live(10).observe(15),
            CursorObservation::Gap {
                expected: 11,
                got: 15,
            }
        );
        assert_eq!(
            CursorState::Live(10).observe(10),
            CursorObservation::Stale {
                got: 10,
                last_applied: Some(10),
            }
        );
        assert_eq!(
            CursorState::Live(10).observe(9),
            CursorObservation::Stale {
                got: 9,
                last_applied: Some(10),
            }
        );
    }

    /// Observing at the top of the `u64` range must not overflow when the
    /// expected next id is computed.
    #[test]
    fn live_observation_handles_u64_max_boundary() {
        assert_eq!(
            CursorState::Live(u64::MAX).observe(u64::MAX),
            CursorObservation::Stale {
                got: u64::MAX,
                last_applied: Some(u64::MAX),
            }
        );
        assert_eq!(
            CursorState::Live(u64::MAX - 1).observe(u64::MAX),
            CursorObservation::Contiguous { got: u64::MAX }
        );
        assert_eq!(
            CursorState::Live(0).observe(u64::MAX),
            CursorObservation::Gap {
                expected: 1,
                got: u64::MAX,
            }
        );
    }

    /// Invalidation keeps the pre-barrier id so older ids are still stale.
    #[test]
    fn barrier_invalidation_preserves_last_applied_id() {
        assert_eq!(
            CursorState::Live(17).invalidate_by_barrier(),
            CursorState::InvalidatedByBarrier(Some(17))
        );
        assert_eq!(
            CursorState::InvalidatedByBarrier(Some(17)).observe(16),
            CursorObservation::Stale {
                got: 16,
                last_applied: Some(17),
            }
        );
        // An id equal to the pre-barrier id is stale, not fresh.
        assert_eq!(
            CursorState::InvalidatedByBarrier(Some(17)).observe(17),
            CursorObservation::Stale {
                got: 17,
                last_applied: Some(17),
            }
        );
        assert_eq!(
            CursorState::InvalidatedByBarrier(Some(17)).observe(20),
            CursorObservation::FreshAfterBarrier {
                got: 20,
                last_before_barrier: Some(17),
            }
        );
    }

    /// With nothing applied before the barrier, every id is fresh.
    #[test]
    fn barrier_invalidation_without_prior_id_accepts_any_id() {
        assert_eq!(
            CursorState::Initial.invalidate_by_barrier(),
            CursorState::InvalidatedByBarrier(None)
        );
        assert_eq!(
            CursorState::InvalidatedByBarrier(None).observe(0),
            CursorObservation::FreshAfterBarrier {
                got: 0,
                last_before_barrier: None,
            }
        );
        // Re-invalidating keeps the id recorded by the first barrier.
        assert_eq!(
            CursorState::InvalidatedByBarrier(Some(5)).invalidate_by_barrier(),
            CursorState::InvalidatedByBarrier(Some(5))
        );
    }

    /// `last_applied_id` exposes the id carried by each state.
    #[test]
    fn last_applied_id_reflects_each_state() {
        assert_eq!(CursorState::default(), CursorState::Initial);
        assert_eq!(CursorState::Initial.last_applied_id(), None);
        assert_eq!(CursorState::Live(3).last_applied_id(), Some(3));
        assert_eq!(
            CursorState::InvalidatedByBarrier(Some(3)).last_applied_id(),
            Some(3)
        );
        assert_eq!(
            CursorState::InvalidatedByBarrier(None).last_applied_id(),
            None
        );
    }

    /// Both transitions yield a live cursor regardless of the prior state.
    #[test]
    fn apply_barrier_and_advance_restore_live_cursor() {
        assert_eq!(
            CursorState::Initial.apply_barrier(20),
            CursorState::Live(20)
        );
        assert_eq!(CursorState::Initial.advance_to(7), CursorState::Live(7));
        assert_eq!(
            CursorState::InvalidatedByBarrier(Some(17)).advance_to(30),
            CursorState::Live(30)
        );
        assert_eq!(
            CursorState::Live(10).apply_barrier(12),
            CursorState::Live(12)
        );
    }
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
mod cursor;
pub use cursor::{CursorObservation, CursorState};
...@@ -298,6 +298,7 @@ impl ListenerRecord { ...@@ -298,6 +298,7 @@ impl ListenerRecord {
} }
} }
#[allow(dead_code)]
fn status(&self) -> ListenerStatus { fn status(&self) -> ListenerStatus {
self.runtime.lock().status self.runtime.lock().status
} }
......
...@@ -25,6 +25,7 @@ use crate::kv_router::worker_kv_indexer_query_endpoint; ...@@ -25,6 +25,7 @@ use crate::kv_router::worker_kv_indexer_query_endpoint;
use dynamo_kv_router::{ use dynamo_kv_router::{
indexer::{LocalKvIndexer, WorkerKvQueryRequest, WorkerKvQueryResponse}, indexer::{LocalKvIndexer, WorkerKvQueryRequest, WorkerKvQueryResponse},
protocols::{DpRank, KvCacheEventData, RouterEvent, WorkerId}, protocols::{DpRank, KvCacheEventData, RouterEvent, WorkerId},
recovery::{CursorObservation, CursorState},
}; };
// Recovery retry configuration // Recovery retry configuration
...@@ -37,28 +38,16 @@ const QUERY_ENDPOINT_PREFIX: &str = "worker_kv_indexer_query_dp"; ...@@ -37,28 +38,16 @@ const QUERY_ENDPOINT_PREFIX: &str = "worker_kv_indexer_query_dp";
type RecoveryKey = (WorkerId, DpRank); type RecoveryKey = (WorkerId, DpRank);
#[derive(Clone, Copy, Debug, Default)]
enum RankCursor {
#[default]
NeedsRestore,
Live(u64),
InvalidatedByBarrier(Option<u64>),
}
#[derive(Debug, Default)] #[derive(Debug, Default)]
struct RankState { struct RankState {
cursor: RankCursor, cursor: CursorState,
max_seen_live_id: Option<u64>, max_seen_live_id: Option<u64>,
recovery_inflight: bool, recovery_inflight: bool,
} }
impl RankState { impl RankState {
fn last_applied_id(&self) -> Option<u64> { fn last_applied_id(&self) -> Option<u64> {
match self.cursor { self.cursor.last_applied_id()
RankCursor::NeedsRestore => None,
RankCursor::Live(event_id) => Some(event_id),
RankCursor::InvalidatedByBarrier(last_applied_id) => last_applied_id,
}
} }
fn observe_live_id(&mut self, event_id: u64) { fn observe_live_id(&mut self, event_id: u64) {
...@@ -301,9 +290,7 @@ impl WorkerQueryClient { ...@@ -301,9 +290,7 @@ impl WorkerQueryClient {
let spawn = { let spawn = {
let mut worker_state = worker_state.lock().await; let mut worker_state = worker_state.lock().await;
let rank_state = worker_state.ranks.entry(dp_rank).or_default(); let rank_state = worker_state.ranks.entry(dp_rank).or_default();
if matches!(rank_state.cursor, RankCursor::NeedsRestore) if matches!(rank_state.cursor, CursorState::Initial) && !rank_state.recovery_inflight {
&& !rank_state.recovery_inflight
{
tracing::info!( tracing::info!(
"WorkerQueryClient: discovered worker {worker_id} dp_rank {dp_rank}, scheduling restore" "WorkerQueryClient: discovered worker {worker_id} dp_rank {dp_rank}, scheduling restore"
); );
...@@ -350,13 +337,13 @@ impl WorkerQueryClient { ...@@ -350,13 +337,13 @@ impl WorkerQueryClient {
worker_state.epoch += 1; worker_state.epoch += 1;
for rank_state in worker_state.ranks.values_mut() { for rank_state in worker_state.ranks.values_mut() {
rank_state.cursor = RankCursor::InvalidatedByBarrier(rank_state.last_applied_id()); rank_state.cursor = rank_state.cursor.invalidate_by_barrier();
rank_state.max_seen_live_id = None; rank_state.max_seen_live_id = None;
rank_state.recovery_inflight = false; rank_state.recovery_inflight = false;
} }
let rank_state = worker_state.ranks.entry(clear_dp_rank).or_default(); let rank_state = worker_state.ranks.entry(clear_dp_rank).or_default();
rank_state.cursor = RankCursor::Live(clear_event_id); rank_state.cursor = rank_state.cursor.apply_barrier(clear_event_id);
tracing::info!( tracing::info!(
"Applying clear barrier for worker {worker_id}; invalidating recovery across {} dp_ranks", "Applying clear barrier for worker {worker_id}; invalidating recovery across {} dp_ranks",
...@@ -394,63 +381,43 @@ impl WorkerQueryClient { ...@@ -394,63 +381,43 @@ impl WorkerQueryClient {
} }
// Already applied the event, so no further action needed. // Already applied the event, so no further action needed.
return; return;
} else { }
match rank_state.cursor {
// We have never established a cursor for this rank, so live traffic only tells match rank_state.cursor.observe(event_id) {
// us how far ahead the stream has moved while a full restore catches up. CursorObservation::Stale { .. } => return,
RankCursor::NeedsRestore => { observation if rank_state.recovery_inflight => {
rank_state.observe_live_id(event_id); match observation {
if !rank_state.recovery_inflight { CursorObservation::Initial { .. }
rank_state.recovery_inflight = true; | CursorObservation::Contiguous { .. }
Action::SpawnFullRestore { | CursorObservation::Gap { .. }
epoch: worker_state.epoch, | CursorObservation::FreshAfterBarrier { .. } => {
}
} else {
// A recovery is already in flight. Nothing to do.
return;
}
}
// Normal steady-state path: apply contiguous events directly, but coalesce any
// gap into a single recovery pass using `max_seen_live_id` as the high-water mark.
RankCursor::Live(last_applied_id) => {
if event_id <= last_applied_id {
// We've already applied this event. Nothing to do.
return;
} else if rank_state.recovery_inflight {
// A recovery is already in flight. Drop the event for now, and potentially spawn a new recovery afterwards.
rank_state.observe_live_id(event_id);
return;
} else if event_id > last_applied_id.saturating_add(1) {
// We've detected a gap. Spawn a new recovery pass.
rank_state.observe_live_id(event_id); rank_state.observe_live_id(event_id);
rank_state.recovery_inflight = true;
Action::SpawnIncremental {
epoch: worker_state.epoch,
start_event_id: last_applied_id.saturating_add(1),
}
} else {
// Apply the event.
rank_state.cursor = RankCursor::Live(event_id);
rank_state.clear_max_seen_if_caught_up(event_id);
Action::ApplyDirect
} }
CursorObservation::Stale { .. } => {}
} }
// A worker-wide barrier (currently `Cleared`) invalidated this rank's old return;
// cursor. The next newer live event becomes the new starting point; we do not }
// recover across the barrier. CursorObservation::Initial { .. } => {
RankCursor::InvalidatedByBarrier(last_applied_id) => { rank_state.observe_live_id(event_id);
if last_applied_id rank_state.recovery_inflight = true;
.is_some_and(|last_applied_id| event_id <= last_applied_id) Action::SpawnFullRestore {
{ epoch: worker_state.epoch,
return; }
} else { }
rank_state.cursor = RankCursor::Live(event_id); CursorObservation::Gap { expected, .. } => {
rank_state.max_seen_live_id = None; rank_state.observe_live_id(event_id);
rank_state.recovery_inflight = false; rank_state.recovery_inflight = true;
Action::ApplyDirect Action::SpawnIncremental {
} epoch: worker_state.epoch,
start_event_id: expected,
} }
} }
CursorObservation::Contiguous { got }
| CursorObservation::FreshAfterBarrier { got, .. } => {
rank_state.cursor = rank_state.cursor.advance_to(got);
rank_state.clear_max_seen_if_caught_up(got);
Action::ApplyDirect
}
} }
}; };
...@@ -531,12 +498,12 @@ impl WorkerQueryClient { ...@@ -531,12 +498,12 @@ impl WorkerQueryClient {
if matches!(&event.event.data, KvCacheEventData::Cleared) { if matches!(&event.event.data, KvCacheEventData::Cleared) {
self.apply_worker_clear_locked(&mut worker_state, event) self.apply_worker_clear_locked(&mut worker_state, event)
.await; .await;
new_cursor = RankCursor::Live(event_id); new_cursor = new_cursor.apply_barrier(event_id);
saw_clear = true; saw_clear = true;
continue; continue;
} }
self.indexer.apply_event(event).await; self.indexer.apply_event(event).await;
new_cursor = RankCursor::Live(event_id); new_cursor = new_cursor.advance_to(event_id);
} }
successful_response = true; successful_response = true;
} }
...@@ -554,16 +521,14 @@ impl WorkerQueryClient { ...@@ -554,16 +521,14 @@ impl WorkerQueryClient {
for event in &events { for event in &events {
self.indexer.apply_event(event.clone()).await; self.indexer.apply_event(event.clone()).await;
} }
new_cursor = RankCursor::Live(last_event_id); new_cursor = new_cursor.advance_to(last_event_id);
successful_response = true; successful_response = true;
} }
Ok(WorkerKvQueryResponse::TooNew { Ok(WorkerKvQueryResponse::TooNew {
requested_start, newest_available, ..
requested_end,
newest_available,
}) => { }) => {
tracing::warn!( tracing::warn!(
"Requested range [{requested_start:?}, {requested_end:?}] is newer than available (newest: {newest_available}) for worker {} dp_rank {}", "Requested recovery is newer than available (newest: {newest_available}) for worker {} dp_rank {}",
key.0, key.0,
key.1 key.1
); );
...@@ -803,6 +768,10 @@ mod tests { ...@@ -803,6 +768,10 @@ mod tests {
fn call_count(&self) -> usize { fn call_count(&self) -> usize {
self.calls.lock().unwrap().len() self.calls.lock().unwrap().len()
} }
fn calls(&self) -> Vec<(RecoveryKey, Option<u64>, Option<u64>)> {
self.calls.lock().unwrap().clone()
}
} }
#[async_trait] #[async_trait]
...@@ -1043,7 +1012,7 @@ mod tests { ...@@ -1043,7 +1012,7 @@ mod tests {
{ {
let worker_state = client.get_or_create_worker_state(key.0); let worker_state = client.get_or_create_worker_state(key.0);
let mut worker_state = worker_state.lock().await; let mut worker_state = worker_state.lock().await;
worker_state.ranks.entry(key.1).or_default().cursor = RankCursor::Live(10); worker_state.ranks.entry(key.1).or_default().cursor = CursorState::Live(10);
} }
let first_started = Arc::new(Notify::new()); let first_started = Arc::new(Notify::new());
...@@ -1082,6 +1051,10 @@ mod tests { ...@@ -1082,6 +1051,10 @@ mod tests {
}) })
}) })
.await; .await;
assert_eq!(
transport.calls(),
vec![(key, Some(11), None), (key, Some(16), None)]
);
kv_indexer.flush().await; kv_indexer.flush().await;
let events = kv_indexer.dump_events().await.unwrap(); let events = kv_indexer.dump_events().await.unwrap();
...@@ -1159,13 +1132,13 @@ mod tests { ...@@ -1159,13 +1132,13 @@ mod tests {
{ {
let worker_state = client.get_or_create_worker_state(delayed_key.0); let worker_state = client.get_or_create_worker_state(delayed_key.0);
let mut worker_state = worker_state.lock().await; let mut worker_state = worker_state.lock().await;
worker_state.ranks.entry(delayed_key.1).or_default().cursor = RankCursor::Live(10); worker_state.ranks.entry(delayed_key.1).or_default().cursor = CursorState::Live(10);
} }
let other_key = (2, 0); let other_key = (2, 0);
{ {
let worker_state = client.get_or_create_worker_state(other_key.0); let worker_state = client.get_or_create_worker_state(other_key.0);
let mut worker_state = worker_state.lock().await; let mut worker_state = worker_state.lock().await;
worker_state.ranks.entry(other_key.1).or_default().cursor = RankCursor::Live(20); worker_state.ranks.entry(other_key.1).or_default().cursor = CursorState::Live(20);
} }
let started = Arc::new(Notify::new()); let started = Arc::new(Notify::new());
...@@ -1210,7 +1183,7 @@ mod tests { ...@@ -1210,7 +1183,7 @@ mod tests {
{ {
let worker_state = client.get_or_create_worker_state(key.0); let worker_state = client.get_or_create_worker_state(key.0);
let mut worker_state = worker_state.lock().await; let mut worker_state = worker_state.lock().await;
worker_state.ranks.entry(key.1).or_default().cursor = RankCursor::Live(10); worker_state.ranks.entry(key.1).or_default().cursor = CursorState::Live(10);
} }
let started = Arc::new(Notify::new()); let started = Arc::new(Notify::new());
...@@ -1247,8 +1220,8 @@ mod tests { ...@@ -1247,8 +1220,8 @@ mod tests {
{ {
let worker_state = client.get_or_create_worker_state(1); let worker_state = client.get_or_create_worker_state(1);
let mut worker_state = worker_state.lock().await; let mut worker_state = worker_state.lock().await;
worker_state.ranks.entry(0).or_default().cursor = RankCursor::Live(10); worker_state.ranks.entry(0).or_default().cursor = CursorState::Live(10);
worker_state.ranks.entry(1).or_default().cursor = RankCursor::Live(20); worker_state.ranks.entry(1).or_default().cursor = CursorState::Live(20);
} }
let started = Arc::new(Notify::new()); let started = Arc::new(Notify::new());
...@@ -1280,6 +1253,9 @@ mod tests { ...@@ -1280,6 +1253,9 @@ mod tests {
}) })
}) })
.await; .await;
assert!(rank_state_matches(&client, key1, |state| {
matches!(state.cursor, CursorState::InvalidatedByBarrier(Some(20)))
}));
client.handle_live_event(make_store_event(1, 0, 15)).await; client.handle_live_event(make_store_event(1, 0, 15)).await;
client.handle_live_event(make_store_event(1, 1, 30)).await; client.handle_live_event(make_store_event(1, 1, 30)).await;
...@@ -1300,8 +1276,8 @@ mod tests { ...@@ -1300,8 +1276,8 @@ mod tests {
{ {
let worker_state = client.get_or_create_worker_state(1); let worker_state = client.get_or_create_worker_state(1);
let mut worker_state = worker_state.lock().await; let mut worker_state = worker_state.lock().await;
worker_state.ranks.entry(0).or_default().cursor = RankCursor::Live(10); worker_state.ranks.entry(0).or_default().cursor = CursorState::Live(10);
worker_state.ranks.entry(1).or_default().cursor = RankCursor::Live(20); worker_state.ranks.entry(1).or_default().cursor = CursorState::Live(20);
} }
transport.push_action( transport.push_action(
...@@ -1327,6 +1303,9 @@ mod tests { ...@@ -1327,6 +1303,9 @@ mod tests {
}) })
}) })
.await; .await;
assert!(rank_state_matches(&client, key1, |state| {
matches!(state.cursor, CursorState::InvalidatedByBarrier(Some(20)))
}));
assert_eq!(transport.call_count(), 1); assert_eq!(transport.call_count(), 1);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment