Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
ed4d8068
Unverified
Commit
ed4d8068
authored
Mar 23, 2026
by
Janelle Cai
Committed by
GitHub
Mar 23, 2026
Browse files
feat: radix tree implementation (#7459)
parent
585b4df7
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
1274 additions
and
20 deletions
+1274
-20
lib/bench/kv_router/mooncake_bench.rs
lib/bench/kv_router/mooncake_bench.rs
+28
-4
lib/kv-router/src/indexer/concurrent_radix_tree.rs
lib/kv-router/src/indexer/concurrent_radix_tree.rs
+0
-7
lib/kv-router/src/indexer/concurrent_radix_tree_compressed.rs
...kv-router/src/indexer/concurrent_radix_tree_compressed.rs
+1197
-0
lib/kv-router/src/indexer/mod.rs
lib/kv-router/src/indexer/mod.rs
+1
-0
lib/kv-router/src/indexer/tests.rs
lib/kv-router/src/indexer/tests.rs
+10
-1
lib/kv-router/src/indexer/thread_pool.rs
lib/kv-router/src/indexer/thread_pool.rs
+36
-8
lib/kv-router/src/lib.rs
lib/kv-router/src/lib.rs
+2
-0
No files found.
lib/bench/kv_router/mooncake_bench.rs
View file @
ed4d8068
...
...
@@ -11,7 +11,9 @@ use dynamo_kv_router::indexer::{
KvIndexer
,
KvIndexerInterface
,
KvIndexerMetrics
,
KvIndexerSharded
,
};
use
dynamo_kv_router
::
protocols
::{
KvCacheEvent
,
KvCacheEventData
,
RouterEvent
};
use
dynamo_kv_router
::{
ConcurrentRadixTree
,
PositionalIndexer
,
ThreadPoolIndexer
};
use
dynamo_kv_router
::{
ConcurrentRadixTree
,
ConcurrentRadixTreeCompressed
,
PositionalIndexer
,
ThreadPoolIndexer
,
};
use
serde
::
Serialize
;
use
std
::
sync
::
Arc
;
use
tokio
::
time
::{
Duration
,
Instant
};
...
...
@@ -47,6 +49,13 @@ enum IndexerArgs {
#[clap(long,
default_value
=
"16"
)]
num_event_workers
:
usize
,
},
/// Compressed concurrent radix tree indexer (compressed edges).
ConcurrentRadixTreeCompressed
{
/// Number of OS threads that consume and apply KV cache events.
#[clap(long,
default_value
=
"16"
)]
num_event_workers
:
usize
,
},
}
impl
IndexerArgs
{
...
...
@@ -75,6 +84,13 @@ impl IndexerArgs {
IndexerArgs
::
ConcurrentRadixTree
{
num_event_workers
}
=>
Arc
::
new
(
ThreadPoolIndexer
::
new
(
ConcurrentRadixTree
::
new
(),
num_event_workers
,
block_size
),
),
IndexerArgs
::
ConcurrentRadixTreeCompressed
{
num_event_workers
}
=>
{
Arc
::
new
(
ThreadPoolIndexer
::
new
(
ConcurrentRadixTreeCompressed
::
new
(),
num_event_workers
,
block_size
,
))
}
}
}
...
...
@@ -83,7 +99,10 @@ impl IndexerArgs {
}
fn
is_multi_threaded
(
name
:
&
str
)
->
bool
{
matches!
(
name
,
"nested-map"
|
"concurrent-radix-tree"
)
matches!
(
name
,
"nested-map"
|
"concurrent-radix-tree"
|
"concurrent-radix-tree-compressed"
)
}
/// Construct an indexer from a short name string.
...
...
@@ -103,9 +122,12 @@ impl IndexerArgs {
"concurrent-radix-tree"
=>
IndexerArgs
::
ConcurrentRadixTree
{
num_event_workers
:
nw
,
},
"concurrent-radix-tree-compressed"
=>
IndexerArgs
::
ConcurrentRadixTreeCompressed
{
num_event_workers
:
nw
,
},
_
=>
anyhow
::
bail!
(
"Unknown indexer '{}'. Valid names: radix-tree, radix-tree-sharded,
\
nested-map, concurrent-radix-tree"
,
nested-map, concurrent-radix-tree
, concurrent-radix-tree-compressed
"
,
name
),
};
...
...
@@ -125,7 +147,8 @@ struct Args {
/// Comma-separated list of indexer names to benchmark and compare on the
/// same plot. Overrides the subcommand indexer when present. Valid names:
/// radix-tree, radix-tree-sharded, nested-map, concurrent-radix-tree.
/// radix-tree, radix-tree-sharded, nested-map, concurrent-radix-tree,
/// concurrent-radix-tree-compressed.
#[clap(long,
value_delimiter
=
','
)]
compare
:
Vec
<
String
>
,
...
...
@@ -536,6 +559,7 @@ async fn main() -> anyhow::Result<()> {
IndexerArgs
::
RadixTreeSharded
{
..
}
=>
"radix-tree-sharded"
,
IndexerArgs
::
NestedMap
{
..
}
=>
"nested-map"
,
IndexerArgs
::
ConcurrentRadixTree
{
..
}
=>
"concurrent-radix-tree"
,
IndexerArgs
::
ConcurrentRadixTreeCompressed
{
..
}
=>
"concurrent-radix-tree-compressed"
,
};
vec!
[
name
.to_string
()]
}
else
{
...
...
lib/kv-router/src/indexer/concurrent_radix_tree.rs
View file @
ed4d8068
...
...
@@ -347,8 +347,6 @@ impl ConcurrentRadixTree {
let
num_blocks_added
=
op
.blocks
.len
();
// In each iteration, we lock the parent block and insert the worker into it from
// the previous iteration. This avoids locking a block twice.
for
block_data
in
op
.blocks
{
let
child
=
{
let
mut
parent_guard
=
current
.write
();
...
...
@@ -364,7 +362,6 @@ impl ConcurrentRadixTree {
// parent_guard is dropped at the end of this block
match
parent_guard
.children
.get
(
&
block_data
.tokens_hash
)
{
Some
(
existing
)
=>
{
// Verify our simplifying assumption: block_hash is uniform across workers
{
let
existing_guard
=
existing
.read
();
if
existing_guard
.block_hash
!=
Some
(
block_data
.block_hash
)
{
...
...
@@ -410,8 +407,6 @@ impl ConcurrentRadixTree {
}
}
// Insert worker into the last child (not yet handled since there is
// no subsequent iteration to pick it up).
if
needs_worker_insert
{
current
.write
()
.workers
.insert
(
worker
);
}
...
...
@@ -451,7 +446,6 @@ impl ConcurrentRadixTree {
continue
;
};
// Remove the worker from this block's worker set.
let
mut
guard
=
block
.write
();
guard
.workers
.remove
(
&
worker
);
if
guard
.workers
.is_empty
()
{
...
...
@@ -569,7 +563,6 @@ impl ConcurrentRadixTree {
// Queue entries: (current_block, parent_hash, tokens_hash)
let
mut
queue
=
VecDeque
::
new
();
// Process root's children first
{
let
root_guard
=
self
.root
.read
();
for
(
tokens_hash
,
child_block
)
in
&
root_guard
.children
{
...
...
lib/kv-router/src/indexer/concurrent_radix_tree_compressed.rs
0 → 100644
View file @
ed4d8068
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Concurrent Radix Tree (compressed trie) implementation for KV cache routing.
//!
//! This module provides a thread-safe radix tree data structure that enables concurrent
//! `find_matches` operations while maintaining correctness for write operations.
//!
//! Unlike a regular trie where each node holds a single hash, each node here holds
//! a compressed edge: a `Vec` of `(LocalBlockHash, ExternalSequenceBlockHash)` pairs.
//! Per-worker validity within each edge is tracked as a match index (cutoff) rather than
//! a simple present/absent flag. Nodes support splitting (when a partial match requires
//! divergent paths) but not merging.
//!
//! # Key Data Structures
//!
//! Each node contains:
//! - `edge`: the sequence of `(LocalBlockHash, ExternalSequenceBlockHash)` pairs
//! - `edge_index`: reverse lookup from `ExternalSequenceBlockHash` to position in `edge`,
//! enabling O(1) position queries during removal.
//! - `full_edge_workers`: workers with full edge coverage (fast path set)
//! - `worker_cutoffs`: workers with partial coverage, mapping to their match index `k`,
//! meaning the worker has cached blocks `edge[0..k]` with `0 < k < edge.len()`.
//! - `children`: child nodes keyed by the first `LocalBlockHash` of the child's edge
//!
//! # Removal Semantics
//!
//! When a remove event arrives for worker `w` at edge position `i`:
//! - current_cutoff = `edge.len()` if `w` is in `full_edge_workers`, else `worker_cutoffs[w]`
//! - If `i >= current_cutoff`: **no-op** (block is already beyond the worker's coverage)
//! - If `i < current_cutoff`: new_cutoff = `i`
//! - If new_cutoff == 0: remove worker entirely from this node
//! - Else: move worker to `worker_cutoffs[w] = new_cutoff`
//!
//! Removal does NOT perform structural splits. Multiple workers can independently reduce
//! their match indices without fragmenting the tree, accurately tracking each worker's
//! individual eviction patterns.
//!
//! # Split Semantics (during store only)
//!
//! When a new store requires splitting an edge at position `pos`:
//! - `full_edge_workers`: full in both prefix (unchanged) and suffix
//! - `worker_cutoffs[w] = k` where `k >= pos`: promoted to full in prefix;
//! in suffix with `adj = k - pos` (partial if `adj > 0`, absent if `adj == 0`)
//! - `worker_cutoffs[w] = k` where `k < pos`: unchanged in prefix, absent from suffix
//!
//! # Concurrency Model
//!
//! - Multiple `find_matches` can run in parallel (read locks only)
//! - Write operations (`apply_event`, `remove_worker`) acquire write locks
//! - Each worker thread owns its own `WorkerLookup`; no cross-thread lookup contention
//! - Deadlock prevention: always lock parent before child (hand-over-hand)
//! - Cross-thread splits: stale lookup entries are resolved lazily via `resolve_lookup`
//!
//! # Limitations vs RadixTree
//!
//! - Does NOT support `expiration_duration` / frequency tracking
//! - `new_with_frequency()` is not provided
//! - `find_matches` does not populate `OverlapScores.frequencies`
use
std
::
sync
::
Arc
;
use
dashmap
::
DashMap
;
use
parking_lot
::
RwLock
;
use
rustc_hash
::{
FxBuildHasher
,
FxHashMap
,
FxHashSet
};
use
std
::
collections
::
VecDeque
;
use
std
::
sync
::
atomic
::{
AtomicUsize
,
Ordering
};
use
super
::{
SyncIndexer
,
WorkerTask
};
use
crate
::
protocols
::
*
;
macro_rules!
read_lock
{
(
$self:expr
,
$lock:expr
)
=>
{
$lock
.read
()
};
}
/// Thread-safe shared reference to a Node.
type
SharedNode
=
Arc
<
RwLock
<
Node
>>
;
/// Per-worker block-hash → node map.
///
/// Maps each `ExternalSequenceBlockHash` to the node whose `edge` contains it.
/// Position within the edge is resolved via `Node::edge_index` (O(1)) rather than
/// stored here, keeping the map compact and correct across concurrent splits.
type
WorkerLookup
=
FxHashMap
<
ExternalSequenceBlockHash
,
SharedNode
>
;
/// A node in the concurrent radix tree.
///
/// Stores a compressed edge with per-worker match indices. Workers with full coverage
/// live in `full_edge_workers` for O(1) set membership tests on the common fast path.
/// Workers with partial coverage live in `worker_cutoffs`.
#[derive(Debug)]
struct
Node
{
/// Compressed edge: sequence of `(LocalBlockHash, ExternalSequenceBlockHash)` pairs.
/// Empty for the root node; non-empty for all other nodes.
edge
:
Vec
<
(
LocalBlockHash
,
ExternalSequenceBlockHash
)
>
,
/// Reverse index: `ExternalSequenceBlockHash` → position in `edge`.
/// Provides O(1) position lookup during removal, avoiding a linear scan.
edge_index
:
FxHashMap
<
ExternalSequenceBlockHash
,
u16
>
,
/// Workers with partial edge coverage. `worker_cutoffs[w] = k` means worker `w`
/// has cached `edge[0..k]`, where `0 < k < edge.len()`.
worker_cutoffs
:
FxHashMap
<
WorkerWithDpRank
,
u16
>
,
/// Workers with full edge coverage (match index == edge.len()).
full_edge_workers
:
FxHashSet
<
WorkerWithDpRank
>
,
/// Child nodes, keyed by the first `LocalBlockHash` of the child's edge.
children
:
FxHashMap
<
LocalBlockHash
,
SharedNode
>
,
}
impl
Node
{
fn
new
()
->
Self
{
Self
{
edge
:
Vec
::
new
(),
edge_index
:
FxHashMap
::
default
(),
worker_cutoffs
:
FxHashMap
::
default
(),
full_edge_workers
:
FxHashSet
::
default
(),
children
:
FxHashMap
::
default
(),
}
}
fn
has_any_workers
(
&
self
)
->
bool
{
!
self
.full_edge_workers
.is_empty
()
||
!
self
.worker_cutoffs
.is_empty
()
}
}
/// Data returned by [`ConcurrentRadixTreeCompressed::split_node`] for deferred lookup updates.
///
/// Callers must call [`ConcurrentRadixTreeCompressed::apply_split_lookup`] **after**
/// dropping the write guard to avoid holding the write lock during O(workers × edge_len)
/// HashMap insertions.
struct
SplitLookupData
{
suffix
:
SharedNode
,
}
/// Thread-safe radix tree (compressed trie) for concurrent KV cache lookups.
pub
struct
ConcurrentRadixTreeCompressed
{
/// The root of the radix tree. Has an empty edge and only contains children.
root
:
SharedNode
,
tree_sizes
:
DashMap
<
WorkerWithDpRank
,
AtomicUsize
,
FxBuildHasher
>
,
}
impl
Default
for
ConcurrentRadixTreeCompressed
{
fn
default
()
->
Self
{
Self
::
new
()
}
}
// Dropping nodes can cause a cascade of drops that overflow the stack.
// This custom drop uses an iterative approach.
impl
Drop
for
ConcurrentRadixTreeCompressed
{
fn
drop
(
&
mut
self
)
{
let
mut
stack
:
Vec
<
SharedNode
>
=
Vec
::
new
();
{
let
mut
root
=
self
.root
.write
();
stack
.extend
(
root
.children
.drain
()
.map
(|(
_
,
v
)|
v
));
}
while
let
Some
(
node
)
=
stack
.pop
()
{
if
let
Ok
(
rwlock
)
=
Arc
::
try_unwrap
(
node
)
{
let
mut
inner
=
rwlock
.into_inner
();
stack
.extend
(
inner
.children
.drain
()
.map
(|(
_
,
v
)|
v
));
}
}
}
}
impl
ConcurrentRadixTreeCompressed
{
pub
fn
new
()
->
Self
{
Self
{
root
:
Arc
::
new
(
RwLock
::
new
(
Node
::
new
())),
tree_sizes
:
DashMap
::
with_hasher
(
FxBuildHasher
),
}
}
// ------------------------------------------------------------------
// Lookup resolution helpers
// ------------------------------------------------------------------
/// Search a node's subtree for the node whose edge contains `hash`.
/// Used to resolve stale lookup entries caused by cross-thread splits.
fn
find_in_subtree
(
start
:
&
SharedNode
,
hash
:
ExternalSequenceBlockHash
)
->
Option
<
SharedNode
>
{
let
mut
stack
=
Vec
::
new
();
{
let
guard
=
start
.read
();
stack
.extend
(
guard
.children
.values
()
.cloned
());
}
while
let
Some
(
node
)
=
stack
.pop
()
{
let
guard
=
node
.read
();
if
guard
.edge_index
.contains_key
(
&
hash
)
{
drop
(
guard
);
return
Some
(
node
);
}
stack
.extend
(
guard
.children
.values
()
.cloned
());
}
None
}
/// Look up `hash` in a worker's lookup, resolving stale entries caused by
/// cross-thread splits. Returns the `SharedNode` whose edge contains `hash`.
fn
resolve_lookup
(
worker_lookup
:
&
mut
WorkerLookup
,
hash
:
ExternalSequenceBlockHash
,
)
->
Option
<
SharedNode
>
{
let
node
=
worker_lookup
.get
(
&
hash
)
?
.clone
();
// Fast path: hash is still in this node's edge_index.
let
found
=
{
let
guard
=
node
.read
();
guard
.edge_index
.contains_key
(
&
hash
)
};
if
found
{
return
Some
(
node
);
}
// Slow path: hash was moved to a descendant by a cross-thread split.
let
resolved
=
Self
::
find_in_subtree
(
&
node
,
hash
)
?
;
worker_lookup
.insert
(
hash
,
resolved
.clone
());
Some
(
resolved
)
}
// ------------------------------------------------------------------
// Split helpers
// ------------------------------------------------------------------
/// Split a node's edge at position `pos` (caller holds the node's write lock).
///
/// Splits `node.edge` into prefix `edge[..pos]` (stays in `node`) and suffix
/// `edge[pos..]` (moved to a new child node). Updates `edge_index` for both
/// halves and distributes workers according to their match indices.
///
/// Worker distribution:
/// - `full_edge_workers`: full in both prefix (unchanged) and suffix
/// - `worker_cutoffs[w] = k`, `k >= pos`: promoted to full in prefix;
/// suffix gets `adj = k - pos` (partial if > 0, absent if == 0)
/// - `worker_cutoffs[w] = k`, `k < pos`: unchanged in prefix, absent from suffix
///
/// Returns `SplitLookupData`; caller must call `apply_split_lookup` after releasing
/// the write guard.
///
/// `pos` must satisfy `0 < pos < node.edge.len()`.
fn
split_node
(
node
:
&
mut
Node
,
pos
:
usize
)
->
SplitLookupData
{
debug_assert!
(
pos
>
0
&&
pos
<
node
.edge
.len
(),
"split position {pos} out of range for edge length {}"
,
node
.edge
.len
()
);
let
suffix_edge
=
node
.edge
.split_off
(
pos
);
let
suffix_first_local
=
suffix_edge
[
0
]
.0
;
let
prefix_len
=
pos
as
u16
;
// Build suffix edge_index (positions reindexed from 0).
let
mut
suffix_edge_index
=
FxHashMap
::
with_capacity_and_hasher
(
suffix_edge
.len
(),
FxBuildHasher
);
for
(
i
,
&
(
_
,
h
))
in
suffix_edge
.iter
()
.enumerate
()
{
suffix_edge_index
.insert
(
h
,
i
as
u16
);
}
// Remove suffix hashes from the prefix edge_index.
for
&
(
_
,
h
)
in
&
suffix_edge
{
node
.edge_index
.remove
(
&
h
);
}
// Distribute workers: full stays full in both; partial workers may be promoted.
let
mut
suffix_full
=
FxHashSet
::
with_capacity_and_hasher
(
node
.full_edge_workers
.len
(),
FxBuildHasher
);
let
mut
suffix_cutoffs
=
FxHashMap
::
with_capacity_and_hasher
(
node
.worker_cutoffs
.len
(),
FxBuildHasher
);
let
mut
to_promote
:
Vec
<
WorkerWithDpRank
>
=
Vec
::
new
();
for
&
w
in
&
node
.full_edge_workers
{
suffix_full
.insert
(
w
);
}
for
(
&
w
,
&
k
)
in
&
node
.worker_cutoffs
{
if
k
>=
prefix_len
{
// Covers the full prefix → promote to full in prefix.
to_promote
.push
(
w
);
let
adj
=
k
-
prefix_len
;
if
adj
>
0
{
suffix_cutoffs
.insert
(
w
,
adj
);
}
// adj == 0: exact split point, absent from suffix.
}
// k < prefix_len: stays partial in prefix (same k), absent from suffix.
}
for
w
in
&
to_promote
{
node
.worker_cutoffs
.remove
(
w
);
node
.full_edge_workers
.insert
(
*
w
);
}
let
suffix_children
=
std
::
mem
::
take
(
&
mut
node
.children
);
let
suffix
=
Arc
::
new
(
RwLock
::
new
(
Node
{
edge
:
suffix_edge
,
edge_index
:
suffix_edge_index
,
worker_cutoffs
:
suffix_cutoffs
,
full_edge_workers
:
suffix_full
,
children
:
suffix_children
,
}));
node
.children
.insert
(
suffix_first_local
,
suffix
.clone
());
SplitLookupData
{
suffix
}
}
/// Apply deferred lookup updates after `split_node`.
///
/// Updates worker lookup maps so entries for blocks that moved to the suffix now
/// point to the suffix node. Must be called **after** the write guard is dropped.
fn
apply_split_lookup
(
lookup
:
&
mut
FxHashMap
<
WorkerWithDpRank
,
WorkerLookup
>
,
split
:
SplitLookupData
,
)
{
let
guard
=
split
.suffix
.read
();
for
&
w
in
&
guard
.full_edge_workers
{
if
let
Some
(
wl
)
=
lookup
.get_mut
(
&
w
)
{
for
&
(
_
,
h
)
in
&
guard
.edge
{
wl
.insert
(
h
,
split
.suffix
.clone
());
}
}
}
for
(
&
w
,
&
k
)
in
&
guard
.worker_cutoffs
{
if
let
Some
(
wl
)
=
lookup
.get_mut
(
&
w
)
{
for
&
(
_
,
h
)
in
&
guard
.edge
[
..
k
as
usize
]
{
wl
.insert
(
h
,
split
.suffix
.clone
());
}
}
}
}
// ------------------------------------------------------------------
// find_matches
// ------------------------------------------------------------------
/// Traverse the radix tree to find the best match for a given sequence of
/// [`LocalBlockHash`]es.
///
/// Workers in `full_edge_workers` are tracked in the `active` set and continue
/// into children. Workers in `worker_cutoffs` are scored at the node where their
/// cutoff falls short and are never propagated into children.
pub
fn
find_matches_impl
(
&
self
,
sequence
:
&
[
LocalBlockHash
],
early_exit
:
bool
,
)
->
OverlapScores
{
let
mut
scores
=
OverlapScores
::
new
();
if
sequence
.is_empty
()
{
return
scores
;
}
let
mut
active
:
FxHashSet
<
WorkerWithDpRank
>
=
FxHashSet
::
default
();
let
mut
active_count
:
usize
=
0
;
let
mut
matched_depth
:
u32
=
0
;
let
mut
seq_pos
:
usize
=
0
;
let
mut
first_node
=
true
;
let
mut
next_child
=
{
let
root_guard
=
read_lock!
(
self
,
self
.root
);
root_guard
.children
.get
(
&
sequence
[
0
])
.cloned
()
};
loop
{
if
seq_pos
>=
sequence
.len
()
{
break
;
}
let
child
=
match
next_child
.take
()
{
Some
(
c
)
=>
c
,
None
=>
break
,
};
let
edge_len
;
let
edge_match_len
;
{
let
guard
=
read_lock!
(
self
,
child
);
edge_len
=
guard
.edge
.len
();
let
walk_len
=
edge_len
.min
(
sequence
.len
()
-
seq_pos
);
// First element is guaranteed by the parent's children HashMap lookup.
let
mut
match_len
=
1
;
for
i
in
1
..
walk_len
{
if
guard
.edge
[
i
]
.0
!=
sequence
[
seq_pos
+
i
]
{
break
;
}
match_len
+=
1
;
}
edge_match_len
=
match_len
;
let
prev_depth
=
matched_depth
;
if
first_node
{
// Seed active set from full-edge workers (they can continue to children).
// Score partial workers immediately; they never continue into children.
active
=
guard
.full_edge_workers
.clone
();
active_count
=
active
.len
();
for
(
&
w
,
&
k
)
in
&
guard
.worker_cutoffs
{
let
contribution
=
(
k
as
usize
)
.min
(
edge_match_len
)
as
u32
;
if
contribution
>
0
{
scores
.scores
.insert
(
w
,
contribution
);
}
}
first_node
=
false
;
}
else
{
let
has_partial
=
!
guard
.worker_cutoffs
.is_empty
();
if
has_partial
{
// Slow path: check each active worker against both maps.
active
.retain
(|
w
|
{
if
guard
.full_edge_workers
.contains
(
w
)
{
true
}
else
if
let
Some
(
&
k
)
=
guard
.worker_cutoffs
.get
(
w
)
{
let
effective
=
(
k
as
usize
)
.min
(
edge_match_len
)
as
u32
;
scores
.scores
.insert
(
*
w
,
prev_depth
+
effective
);
false
}
else
{
scores
.scores
.insert
(
*
w
,
prev_depth
);
false
}
});
}
else
{
// Fast path: no partial workers — all coverage is full or absent.
let
full_count
=
guard
.full_edge_workers
.len
();
if
full_count
!=
active_count
{
active
.retain
(|
w
|
{
if
guard
.full_edge_workers
.contains
(
w
)
{
true
}
else
{
scores
.scores
.insert
(
*
w
,
prev_depth
);
false
}
});
}
// full_count == active_count: sets are identical (fast path).
}
active_count
=
active
.len
();
}
next_child
=
if
edge_match_len
==
edge_len
&&
active_count
>
0
&&
seq_pos
+
edge_match_len
<
sequence
.len
()
{
guard
.children
.get
(
&
sequence
[
seq_pos
+
edge_match_len
])
.cloned
()
}
else
{
None
};
}
if
active_count
==
0
{
break
;
}
matched_depth
+=
edge_match_len
as
u32
;
if
edge_match_len
<
edge_len
{
break
;
}
seq_pos
+=
edge_match_len
;
if
early_exit
&&
active_count
==
1
{
break
;
}
}
for
worker
in
&
active
{
scores
.scores
.insert
(
*
worker
,
matched_depth
);
}
for
worker
in
scores
.scores
.keys
()
{
if
let
Some
(
s
)
=
self
.tree_sizes
.get
(
worker
)
{
scores
.tree_sizes
.insert
(
*
worker
,
s
.load
(
Ordering
::
Relaxed
));
}
}
scores
}
// ------------------------------------------------------------------
// apply_event dispatch
// ------------------------------------------------------------------
fn
apply_event
(
&
self
,
lookup
:
&
mut
FxHashMap
<
WorkerWithDpRank
,
WorkerLookup
>
,
event
:
RouterEvent
,
)
->
Result
<
(),
KvCacheEventError
>
{
let
(
worker_id
,
kv_event
)
=
(
event
.worker_id
,
event
.event
);
let
(
id
,
op
)
=
(
kv_event
.event_id
,
kv_event
.data
);
let
worker
=
WorkerWithDpRank
::
new
(
worker_id
,
kv_event
.dp_rank
);
match
op
{
KvCacheEventData
::
Stored
(
op
)
=>
self
.apply_stored
(
lookup
,
worker
,
op
,
id
),
KvCacheEventData
::
Removed
(
op
)
=>
self
.apply_removed
(
lookup
,
worker
,
op
,
id
),
KvCacheEventData
::
Cleared
=>
{
lookup
.entry
(
worker
)
.or_default
();
self
.tree_sizes
.entry
(
worker
)
.or_insert_with
(||
AtomicUsize
::
new
(
0
));
self
.clear_all_blocks
(
lookup
,
worker
.worker_id
);
Ok
(())
}
}
}
// ------------------------------------------------------------------
// apply_stored
// ------------------------------------------------------------------
fn
apply_stored
(
&
self
,
lookup
:
&
mut
FxHashMap
<
WorkerWithDpRank
,
WorkerLookup
>
,
worker
:
WorkerWithDpRank
,
op
:
KvCacheStoreData
,
id
:
u64
,
)
->
Result
<
(),
KvCacheEventError
>
{
lookup
.entry
(
worker
)
.or_default
();
let
parent
=
match
op
.parent_hash
{
Some
(
parent_hash
)
=>
{
// Retry loop: re-resolve if a concurrent split moves parent_hash
// into a descendant between resolve_lookup and the write lock below.
loop
{
let
node
=
{
let
wl
=
lookup
.get_mut
(
&
worker
)
.unwrap
();
match
Self
::
resolve_lookup
(
wl
,
parent_hash
)
{
Some
(
n
)
=>
n
,
None
=>
{
tracing
::
warn!
(
worker_id
=
worker
.worker_id
.to_string
(),
dp_rank
=
worker
.dp_rank
,
id
,
parent_hash
=
?
op
.parent_hash
,
num_blocks
=
op
.blocks
.len
(),
"Failed to find parent block; skipping store operation"
);
return
Err
(
KvCacheEventError
::
ParentBlockNotFound
);
}
}
};
// Verify the worker still covers parent_hash. A prior removal may
// have reduced the worker's cutoff past this position, leaving a
// stale entry in the lookup map.
{
let
guard
=
node
.read
();
if
let
Some
(
&
pos_u16
)
=
guard
.edge_index
.get
(
&
parent_hash
)
{
let
pos
=
pos_u16
as
usize
;
let
is_full
=
guard
.full_edge_workers
.contains
(
&
worker
);
let
cutoff
=
if
is_full
{
guard
.edge
.len
()
}
else
{
guard
.worker_cutoffs
.get
(
&
worker
)
.copied
()
.map
(|
k
|
k
as
usize
)
.unwrap_or
(
0
)
};
if
pos
>=
cutoff
{
tracing
::
warn!
(
worker_id
=
worker
.worker_id
.to_string
(),
dp_rank
=
worker
.dp_rank
,
id
,
parent_hash
=
?
parent_hash
,
pos
,
cutoff
,
"Stale parent: worker no longer covers parent_hash; rejecting store"
);
drop
(
guard
);
let
wl
=
lookup
.get_mut
(
&
worker
)
.unwrap
();
wl
.remove
(
&
parent_hash
);
return
Err
(
KvCacheEventError
::
ParentBlockNotFound
);
}
}
}
// If parent_hash is not the tail of the node's edge, split so it becomes tail.
// We check edge_index inside the write lock: if parent_hash is absent, a
// concurrent split moved it to a descendant — retry resolve from the top.
let
split_data
=
{
let
mut
guard
=
node
.write
();
if
!
guard
.edge_index
.contains_key
(
&
parent_hash
)
{
// Concurrent split moved parent_hash; retry resolve.
continue
;
}
if
!
guard
.edge
.is_empty
()
&&
guard
.edge
.last
()
.unwrap
()
.1
!=
parent_hash
{
guard
.edge
.iter
()
.position
(|
&
(
_
,
h
)|
h
==
parent_hash
)
.map
(|
pos
|
Self
::
split_node
(
&
mut
guard
,
pos
+
1
))
}
else
{
None
}
};
if
let
Some
(
split
)
=
split_data
{
Self
::
apply_split_lookup
(
lookup
,
split
);
}
break
node
;
}
}
None
=>
self
.root
.clone
(),
};
let
num_blocks
=
op
.blocks
.len
();
self
.insert_blocks_from
(
lookup
,
worker
,
&
parent
,
op
.parent_hash
,
&
op
.blocks
);
match
self
.tree_sizes
.get
(
&
worker
)
{
Some
(
size
)
=>
{
size
.fetch_add
(
num_blocks
,
Ordering
::
Relaxed
);
}
None
=>
{
self
.tree_sizes
.insert
(
worker
,
AtomicUsize
::
new
(
num_blocks
));
}
}
Ok
(())
}
fn
insert_blocks_from
(
&
self
,
lookup
:
&
mut
FxHashMap
<
WorkerWithDpRank
,
WorkerLookup
>
,
worker
:
WorkerWithDpRank
,
parent
:
&
SharedNode
,
seed_hash
:
Option
<
ExternalSequenceBlockHash
>
,
blocks
:
&
[
KvCacheStoredBlockData
],
)
{
let
mut
current_parent
=
parent
.clone
();
let
mut
remaining
=
blocks
;
// Track the last ExternalSequenceBlockHash we matched to detect if
// `current_parent` was split by a concurrent thread between iterations.
// A split shortens `current_parent`'s edge and moves our last-matched
// hash into a new suffix child. We detect this cheaply inside the write
// lock we already take on `current_parent`, so no extra lock is needed
// in the common case.
//
// Seeded with parent_hash so the very first iteration detects a split
// that occurred after apply_stored released its write lock but before
// we acquired ours here.
let
mut
last_ext_hash
:
Option
<
ExternalSequenceBlockHash
>
=
seed_hash
;
while
!
remaining
.is_empty
()
{
let
first_local
=
remaining
[
0
]
.tokens_hash
;
let
child
=
{
let
mut
parent_guard
=
current_parent
.write
();
// Detect concurrent split: if last_ext_hash is no longer in
// this node's edge_index, another thread shortened this edge.
// Drop the lock, re-resolve to the correct suffix node, retry.
if
let
Some
(
hash
)
=
last_ext_hash
&&
!
parent_guard
.edge_index
.contains_key
(
&
hash
)
{
drop
(
parent_guard
);
let
wl
=
lookup
.get_mut
(
&
worker
)
.unwrap
();
if
let
Some
(
resolved
)
=
Self
::
resolve_lookup
(
wl
,
hash
)
{
current_parent
=
resolved
;
}
continue
;
}
match
parent_guard
.children
.get
(
&
first_local
)
.cloned
()
{
Some
(
existing
)
=>
existing
,
None
=>
{
// No existing child — create a new node for all remaining blocks.
let
edge
:
Vec
<
(
LocalBlockHash
,
ExternalSequenceBlockHash
)
>
=
remaining
.iter
()
.map
(|
b
|
(
b
.tokens_hash
,
b
.block_hash
))
.collect
();
let
mut
edge_index
=
FxHashMap
::
with_capacity_and_hasher
(
edge
.len
(),
FxBuildHasher
);
for
(
i
,
&
(
_
,
h
))
in
edge
.iter
()
.enumerate
()
{
edge_index
.insert
(
h
,
i
as
u16
);
}
let
mut
full_edge_workers
=
FxHashSet
::
with_capacity_and_hasher
(
1
,
FxBuildHasher
);
full_edge_workers
.insert
(
worker
);
let
new_node
=
Arc
::
new
(
RwLock
::
new
(
Node
{
edge
,
edge_index
,
worker_cutoffs
:
FxHashMap
::
default
(),
full_edge_workers
,
children
:
FxHashMap
::
default
(),
}));
parent_guard
.children
.insert
(
first_local
,
new_node
.clone
());
drop
(
parent_guard
);
let
wl
=
lookup
.get_mut
(
&
worker
)
.unwrap
();
for
b
in
remaining
{
wl
.insert
(
b
.block_hash
,
new_node
.clone
());
}
return
;
}
}
};
{
let
mut
child_guard
=
child
.write
();
let
edge_len
=
child_guard
.edge
.len
();
let
mut
match_len
=
0
;
for
(
edge_elem
,
rem_elem
)
in
child_guard
.edge
.iter
()
.zip
(
remaining
.iter
())
{
if
edge_elem
.0
!=
rem_elem
.tokens_hash
{
break
;
}
if
edge_elem
.1
!=
rem_elem
.block_hash
{
tracing
::
warn!
(
expected
=
?
rem_elem
.block_hash
,
actual
=
?
edge_elem
.1
,
"block_hash mismatch: sequence hashes should be uniform across workers"
);
}
match_len
+=
1
;
}
debug_assert!
(
match_len
>=
1
,
"first hash must match since child was found by it"
);
if
match_len
<
edge_len
{
// Partial edge match: split at match_len, add worker to prefix.
let
split
=
Self
::
split_node
(
&
mut
child_guard
,
match_len
);
// Ensure worker has full coverage of the prefix.
if
!
child_guard
.full_edge_workers
.contains
(
&
worker
)
{
child_guard
.worker_cutoffs
.remove
(
&
worker
);
child_guard
.full_edge_workers
.insert
(
worker
);
}
let
tail
=
&
remaining
[
match_len
..
];
if
!
tail
.is_empty
()
{
// Create new tail node for the worker's additional blocks.
let
edge
:
Vec
<
(
LocalBlockHash
,
ExternalSequenceBlockHash
)
>
=
tail
.iter
()
.map
(|
b
|
(
b
.tokens_hash
,
b
.block_hash
))
.collect
();
let
mut
edge_index
=
FxHashMap
::
with_capacity_and_hasher
(
edge
.len
(),
FxBuildHasher
);
for
(
i
,
&
(
_
,
h
))
in
edge
.iter
()
.enumerate
()
{
edge_index
.insert
(
h
,
i
as
u16
);
}
let
mut
full_edge_workers
=
FxHashSet
::
with_capacity_and_hasher
(
1
,
FxBuildHasher
);
full_edge_workers
.insert
(
worker
);
let
tail_first_local
=
tail
[
0
]
.tokens_hash
;
let
new_node
=
Arc
::
new
(
RwLock
::
new
(
Node
{
edge
,
edge_index
,
worker_cutoffs
:
FxHashMap
::
default
(),
full_edge_workers
,
children
:
FxHashMap
::
default
(),
}));
child_guard
.children
.insert
(
tail_first_local
,
new_node
.clone
());
drop
(
child_guard
);
Self
::
apply_split_lookup
(
lookup
,
split
);
let
wl
=
lookup
.get_mut
(
&
worker
)
.unwrap
();
for
b
in
&
remaining
[
..
match_len
]
{
wl
.insert
(
b
.block_hash
,
child
.clone
());
}
for
b
in
tail
{
wl
.insert
(
b
.block_hash
,
new_node
.clone
());
}
}
else
{
drop
(
child_guard
);
Self
::
apply_split_lookup
(
lookup
,
split
);
let
wl
=
lookup
.get_mut
(
&
worker
)
.unwrap
();
for
b
in
&
remaining
[
..
match_len
]
{
wl
.insert
(
b
.block_hash
,
child
.clone
());
}
}
return
;
}
// Full edge match: upgrade worker to full coverage if necessary.
if
!
child_guard
.full_edge_workers
.contains
(
&
worker
)
{
child_guard
.worker_cutoffs
.remove
(
&
worker
);
child_guard
.full_edge_workers
.insert
(
worker
);
}
drop
(
child_guard
);
let
wl
=
lookup
.get_mut
(
&
worker
)
.unwrap
();
for
b
in
&
remaining
[
..
edge_len
]
{
wl
.insert
(
b
.block_hash
,
child
.clone
());
}
last_ext_hash
=
Some
(
remaining
[
edge_len
-
1
]
.block_hash
);
remaining
=
&
remaining
[
edge_len
..
];
current_parent
=
child
;
}
}
}
// ------------------------------------------------------------------
// apply_removed
// ------------------------------------------------------------------
/// Apply a remove operation (eviction).
///
/// For each evicted block hash, finds its position in the node via `edge_index` (O(1)).
/// Updates the worker's match index without splitting the tree:
/// - `pos >= current_cutoff`: no-op (already beyond coverage)
/// - `pos < current_cutoff`: `new_cutoff = pos`; moves worker to `worker_cutoffs`
/// or removes entirely if `new_cutoff == 0`.
fn
apply_removed
(
&
self
,
lookup
:
&
mut
FxHashMap
<
WorkerWithDpRank
,
WorkerLookup
>
,
worker
:
WorkerWithDpRank
,
op
:
KvCacheRemoveData
,
id
:
u64
,
)
->
Result
<
(),
KvCacheEventError
>
{
if
!
lookup
.contains_key
(
&
worker
)
{
return
Err
(
KvCacheEventError
::
BlockNotFound
);
}
let
mut
total_removed
=
0u
size
;
'outer
:
for
block_hash
in
op
.block_hashes
{
let
mut
cur_node
=
{
let
Some
(
wl
)
=
lookup
.get_mut
(
&
worker
)
else
{
continue
;
};
match
Self
::
resolve_lookup
(
wl
,
block_hash
)
{
Some
(
n
)
=>
n
,
None
=>
{
tracing
::
debug!
(
worker_id
=
worker
.worker_id
.to_string
(),
dp_rank
=
worker
.dp_rank
,
id
,
block_hash
=
?
block_hash
,
"Block not found during remove; skipping"
);
continue
;
}
}
};
loop
{
// Returns Some(removed_count) on success, None if the node is stale
// (hash was moved to a descendant by a concurrent split).
let
update
:
Option
<
usize
>
=
{
let
mut
guard
=
cur_node
.write
();
match
guard
.edge_index
.get
(
&
block_hash
)
.copied
()
{
None
=>
None
,
// stale: hash moved to a child
Some
(
pos_u16
)
=>
{
let
pos
=
pos_u16
as
usize
;
// Determine the worker's current match index.
// Use 0 as sentinel for "not tracked" → pos >= 0 is always true → no-op.
let
is_full
=
guard
.full_edge_workers
.contains
(
&
worker
);
let
current_cutoff
=
if
is_full
{
guard
.edge
.len
()
}
else
{
guard
.worker_cutoffs
.get
(
&
worker
)
.copied
()
.map
(|
k
|
k
as
usize
)
.unwrap_or
(
0
)
};
if
pos
>=
current_cutoff
{
// Block is at or beyond current coverage — no-op.
Some
(
0
)
}
else
{
let
new_cutoff
=
pos
;
let
removed
=
current_cutoff
-
new_cutoff
;
if
new_cutoff
==
0
{
// Worker loses all coverage in this node.
if
is_full
{
guard
.full_edge_workers
.remove
(
&
worker
);
}
else
{
guard
.worker_cutoffs
.remove
(
&
worker
);
}
}
else
{
// Worker retains coverage of edge[0..new_cutoff].
if
is_full
{
guard
.full_edge_workers
.remove
(
&
worker
);
}
guard
.worker_cutoffs
.insert
(
worker
,
new_cutoff
as
u16
);
}
if
!
guard
.has_any_workers
()
{
guard
.children
.clear
();
}
Some
(
removed
)
}
}
}
};
match
update
{
Some
(
removed
)
=>
{
total_removed
+=
removed
;
// Remove this specific hash from the lookup. Other hashes at
// positions > new_cutoff remain and are cleaned up lazily when
// their own remove events arrive (they will be no-ops).
if
let
Some
(
wl
)
=
lookup
.get_mut
(
&
worker
)
{
wl
.remove
(
&
block_hash
);
}
continue
'outer
;
}
None
=>
{
// Hash was moved to a descendant by a concurrent split.
match
Self
::
find_in_subtree
(
&
cur_node
,
block_hash
)
{
Some
(
resolved
)
=>
{
if
let
Some
(
wl
)
=
lookup
.get_mut
(
&
worker
)
{
wl
.insert
(
block_hash
,
resolved
.clone
());
}
cur_node
=
resolved
;
// Retry the inner loop with the resolved node.
}
None
=>
{
// Hash not found anywhere — evicted by a concurrent clear.
tracing
::
debug!
(
worker_id
=
worker
.worker_id
.to_string
(),
dp_rank
=
worker
.dp_rank
,
id
,
block_hash
=
?
block_hash
,
"Block not found in subtree during remove; skipping"
);
if
let
Some
(
wl
)
=
lookup
.get_mut
(
&
worker
)
{
wl
.remove
(
&
block_hash
);
}
continue
'outer
;
}
}
}
}
}
}
match
self
.tree_sizes
.get
(
&
worker
)
{
Some
(
size
)
=>
{
size
.fetch_update
(
Ordering
::
Relaxed
,
Ordering
::
Relaxed
,
|
v
|
{
Some
(
v
.saturating_sub
(
total_removed
))
})
.ok
();
}
None
=>
{
self
.tree_sizes
.insert
(
worker
,
AtomicUsize
::
new
(
0
));
}
}
Ok
(())
}
// ------------------------------------------------------------------
// Worker removal / clearing
// ------------------------------------------------------------------
fn
remove_or_clear_worker_blocks
(
&
self
,
lookup
:
&
mut
FxHashMap
<
WorkerWithDpRank
,
WorkerLookup
>
,
worker_id
:
WorkerId
,
keep_worker
:
bool
,
)
{
let
workers
:
Vec
<
WorkerWithDpRank
>
=
lookup
.keys
()
.filter
(|
w
|
w
.worker_id
==
worker_id
)
.copied
()
.collect
();
for
worker
in
workers
{
if
let
Some
(
worker_lookup
)
=
lookup
.remove
(
&
worker
)
{
let
mut
seen
=
FxHashSet
::
<
usize
>
::
default
();
for
(
_
,
node
)
in
worker_lookup
.into_iter
()
{
let
ptr
=
Arc
::
as_ptr
(
&
node
)
as
usize
;
if
!
seen
.insert
(
ptr
)
{
continue
;
}
let
mut
guard
=
node
.write
();
guard
.full_edge_workers
.remove
(
&
worker
);
guard
.worker_cutoffs
.remove
(
&
worker
);
if
!
guard
.has_any_workers
()
{
guard
.children
.clear
();
}
}
if
keep_worker
{
lookup
.insert
(
worker
,
FxHashMap
::
default
());
if
let
Some
(
size
)
=
self
.tree_sizes
.get
(
&
worker
)
{
size
.store
(
0
,
Ordering
::
Relaxed
);
}
}
else
{
self
.tree_sizes
.remove
(
&
worker
);
}
}
}
}
fn
remove_worker_dp_rank
(
&
self
,
lookup
:
&
mut
FxHashMap
<
WorkerWithDpRank
,
WorkerLookup
>
,
worker_id
:
WorkerId
,
dp_rank
:
DpRank
,
)
{
let
key
=
WorkerWithDpRank
{
worker_id
,
dp_rank
};
if
let
Some
(
worker_lookup
)
=
lookup
.remove
(
&
key
)
{
let
mut
seen
=
FxHashSet
::
<
usize
>
::
default
();
for
(
_
,
node
)
in
worker_lookup
.into_iter
()
{
let
ptr
=
Arc
::
as_ptr
(
&
node
)
as
usize
;
if
!
seen
.insert
(
ptr
)
{
continue
;
}
let
mut
guard
=
node
.write
();
guard
.full_edge_workers
.remove
(
&
key
);
guard
.worker_cutoffs
.remove
(
&
key
);
if
!
guard
.has_any_workers
()
{
guard
.children
.clear
();
}
}
self
.tree_sizes
.remove
(
&
key
);
}
}
fn
clear_all_blocks
(
&
self
,
lookup
:
&
mut
FxHashMap
<
WorkerWithDpRank
,
WorkerLookup
>
,
worker_id
:
WorkerId
,
)
{
self
.remove_or_clear_worker_blocks
(
lookup
,
worker_id
,
true
);
}
// ------------------------------------------------------------------
// Accessors
// ------------------------------------------------------------------
pub
fn
get_workers
(
&
self
)
->
Vec
<
WorkerId
>
{
let
mut
worker_ids
:
Vec
<
WorkerId
>
=
self
.tree_sizes
.iter
()
.map
(|
entry
|
entry
.key
()
.worker_id
)
.collect
();
worker_ids
.sort_unstable
();
worker_ids
.dedup
();
worker_ids
}
// ------------------------------------------------------------------
// Tree dump
// ------------------------------------------------------------------
fn
dump_tree_as_events
(
&
self
)
->
Vec
<
RouterEvent
>
{
tracing
::
debug!
(
"Dumping concurrent radix tree as events"
);
let
mut
events
=
Vec
::
new
();
let
mut
event_id
=
0u64
;
let
mut
queue
=
VecDeque
::
new
();
{
let
root_guard
=
self
.root
.read
();
for
child_node
in
root_guard
.children
.values
()
{
queue
.push_back
((
child_node
.clone
(),
None
::
<
ExternalSequenceBlockHash
>
));
}
}
while
let
Some
((
start_node
,
parent_hash
))
=
queue
.pop_front
()
{
let
mut
merged_edge
:
Vec
<
(
LocalBlockHash
,
ExternalSequenceBlockHash
)
>
=
Vec
::
new
();
let
mut
current
=
start_node
;
loop
{
let
guard
=
current
.read
();
if
!
guard
.has_any_workers
()
&&
guard
.children
.is_empty
()
{
break
;
}
merged_edge
.extend_from_slice
(
&
guard
.edge
);
let
live_children
:
Vec
<
SharedNode
>
=
guard
.children
.values
()
.filter
(|
child
|
{
let
cg
=
child
.read
();
cg
.has_any_workers
()
||
!
cg
.children
.is_empty
()
})
.cloned
()
.collect
();
// Merge condition: this node is a pure passthrough that can be
// collapsed with its single child. Requires identical worker sets
// and no partial-coverage cutoffs on either side.
let
can_merge
=
guard
.worker_cutoffs
.is_empty
()
&&
live_children
.len
()
==
1
&&
{
let
cg
=
live_children
[
0
]
.read
();
cg
.full_edge_workers
==
guard
.full_edge_workers
&&
cg
.worker_cutoffs
.is_empty
()
&&
cg
.has_any_workers
()
};
if
can_merge
{
let
next
=
live_children
[
0
]
.clone
();
drop
(
guard
);
current
=
next
;
continue
;
}
if
merged_edge
.is_empty
()
{
drop
(
guard
);
break
;
}
let
full_blocks
:
Vec
<
KvCacheStoredBlockData
>
=
merged_edge
.iter
()
.map
(|
&
(
local
,
ext
)|
KvCacheStoredBlockData
{
tokens_hash
:
local
,
block_hash
:
ext
,
mm_extra_info
:
None
,
})
.collect
();
let
last_ext
=
merged_edge
.last
()
.unwrap
()
.1
;
for
&
worker
in
&
guard
.full_edge_workers
{
events
.push
(
RouterEvent
::
new
(
worker
.worker_id
,
KvCacheEvent
{
event_id
,
data
:
KvCacheEventData
::
Stored
(
KvCacheStoreData
{
parent_hash
,
blocks
:
full_blocks
.clone
(),
}),
dp_rank
:
worker
.dp_rank
,
},
));
event_id
+=
1
;
}
for
(
&
worker
,
&
k
)
in
&
guard
.worker_cutoffs
{
events
.push
(
RouterEvent
::
new
(
worker
.worker_id
,
KvCacheEvent
{
event_id
,
data
:
KvCacheEventData
::
Stored
(
KvCacheStoreData
{
parent_hash
,
blocks
:
full_blocks
[
..
k
as
usize
]
.to_vec
(),
}),
dp_rank
:
worker
.dp_rank
,
},
));
event_id
+=
1
;
}
for
child
in
live_children
{
queue
.push_back
((
child
,
Some
(
last_ext
)));
}
drop
(
guard
);
break
;
}
}
events
}
}
// ============================================================================
// SyncIndexer implementation for ConcurrentRadixTreeCompressed
// ============================================================================
impl
SyncIndexer
for
ConcurrentRadixTreeCompressed
{
fn
worker
(
&
self
,
event_receiver
:
flume
::
Receiver
<
WorkerTask
>
)
->
anyhow
::
Result
<
()
>
{
let
mut
lookup
=
FxHashMap
::
default
();
while
let
Ok
(
task
)
=
event_receiver
.recv
()
{
match
task
{
WorkerTask
::
Event
(
event
)
=>
{
if
let
Err
(
e
)
=
self
.apply_event
(
&
mut
lookup
,
event
)
{
tracing
::
warn!
(
"Failed to apply event: {:?}"
,
e
);
}
}
WorkerTask
::
RemoveWorker
(
worker_id
)
=>
{
self
.remove_or_clear_worker_blocks
(
&
mut
lookup
,
worker_id
,
false
);
}
WorkerTask
::
RemoveWorkerDpRank
(
worker_id
,
dp_rank
)
=>
{
self
.remove_worker_dp_rank
(
&
mut
lookup
,
worker_id
,
dp_rank
);
}
WorkerTask
::
DumpEvents
(
_
sender
)
=>
{
let
_
=
_
sender
.send
(
Ok
(
Vec
::
new
()));
}
WorkerTask
::
Terminate
=>
{
break
;
}
}
}
tracing
::
debug!
(
"ConcurrentRadixTreeCompressed worker thread shutting down"
);
Ok
(())
}
fn
find_matches
(
&
self
,
sequence
:
&
[
LocalBlockHash
],
early_exit
:
bool
)
->
OverlapScores
{
self
.find_matches_impl
(
sequence
,
early_exit
)
}
fn
dump_events
(
&
self
)
->
Option
<
Vec
<
RouterEvent
>>
{
Some
(
self
.dump_tree_as_events
())
}
}
lib/kv-router/src/indexer/mod.rs
View file @
ed4d8068
...
...
@@ -40,6 +40,7 @@ mod traits;
mod
types
;
pub
mod
concurrent_radix_tree
;
pub
mod
concurrent_radix_tree_compressed
;
pub
mod
positional
;
pub
mod
pruning
;
pub
mod
radix_tree
;
...
...
lib/kv-router/src/indexer/tests.rs
View file @
ed4d8068
...
...
@@ -10,6 +10,7 @@ use tokio::time;
use
tokio_util
::
sync
::
CancellationToken
;
use
super
::
concurrent_radix_tree
::
ConcurrentRadixTree
;
use
super
::
concurrent_radix_tree_compressed
::
ConcurrentRadixTreeCompressed
;
use
super
::
positional
::
PositionalIndexer
;
use
super
::
*
;
use
crate
::
protocols
::
*
;
...
...
@@ -204,7 +205,10 @@ fn make_clear_event_with_dp_rank(worker_id: u64, dp_rank: u32) -> RouterEvent {
#[template]
#[rstest]
fn
indexer_template
(
#[values(
"single"
,
"sharded"
,
"flat"
,
"concurrent"
)]
variant
:
&
str
)
{}
fn
indexer_template
(
#[values(
"single"
,
"sharded"
,
"flat"
,
"concurrent"
,
"concurrent_compressed"
)]
variant
:
&
str
,
)
{
}
fn
make_indexer
(
variant
:
&
str
)
->
Box
<
dyn
KvIndexerInterface
>
{
let
token
=
CancellationToken
::
new
();
...
...
@@ -224,6 +228,11 @@ fn make_indexer(variant: &str) -> Box<dyn KvIndexerInterface> {
4
,
kv_block_size
,
)),
"concurrent_compressed"
=>
Box
::
new
(
ThreadPoolIndexer
::
new
(
ConcurrentRadixTreeCompressed
::
new
(),
4
,
kv_block_size
,
)),
_
=>
panic!
(
"Unknown variant: {}"
,
variant
),
}
}
...
...
lib/kv-router/src/indexer/thread_pool.rs
View file @
ed4d8068
...
...
@@ -123,6 +123,28 @@ impl<T: SyncIndexer> ThreadPoolIndexer<T> {
}
}
impl
<
T
:
SyncIndexer
>
Drop
for
ThreadPoolIndexer
<
T
>
{
fn
drop
(
&
mut
self
)
{
// Send Terminate to all worker threads so they exit their recv loops
// and drop their Arc<T> clones. Then join the threads to ensure the
// clones are actually dropped before the compiler drops `self.backend`.
// Without this, worker threads may still be alive when `backend` drops,
// keeping the Arc refcount > 0 and preventing T::drop() from running.
for
channel
in
self
.worker_event_channels
.iter
()
{
let
_
=
channel
.send
(
WorkerTask
::
Terminate
);
}
let
handles
=
std
::
mem
::
take
(
&
mut
*
self
.thread_handles
.lock
()
.expect
(
"thread_handles mutex poisoned"
),
);
for
handle
in
handles
{
let
_
=
handle
.join
();
}
}
}
#[async_trait]
impl
<
T
:
SyncIndexer
>
KvIndexerInterface
for
ThreadPoolIndexer
<
T
>
{
async
fn
find_matches
(
...
...
@@ -217,12 +239,10 @@ impl<T: SyncIndexer> KvIndexerInterface for ThreadPoolIndexer<T> {
}
async
fn
dump_events
(
&
self
)
->
Result
<
Vec
<
RouterEvent
>
,
KvRouterError
>
{
// Fast path: backend can dump directly from shared state (e.g. ConcurrentRadixTree).
if
let
Some
(
events
)
=
self
.backend
.dump_events
()
{
return
Ok
(
events
);
}
// Slow path: collect from each worker thread via channel (e.g. PositionalIndexer).
// Send DumpEvents to every worker as a FIFO barrier: each worker must
// finish processing all previously queued Events before it handles
// DumpEvents, so by the time all workers respond we know the shared
// tree (if any) reflects every event that was enqueued before this call.
let
mut
receivers
=
Vec
::
new
();
for
channel
in
&
self
.worker_event_channels
{
...
...
@@ -235,9 +255,8 @@ impl<T: SyncIndexer> KvIndexerInterface for ThreadPoolIndexer<T> {
receivers
.push
(
resp_rx
);
}
let
mut
event_id_counter
=
0
;
let
mut
all_events
=
Vec
::
new
();
let
mut
event_id_counter
=
0u64
;
for
resp_rx
in
receivers
{
let
mut
events
=
resp_rx
...
...
@@ -251,6 +270,15 @@ impl<T: SyncIndexer> KvIndexerInterface for ThreadPoolIndexer<T> {
all_events
.extend
(
events
);
}
// Shared-state backends keep their tree in concurrent structures
// readable from any thread. Now that the barrier above guarantees
// all queued writes have landed, dump directly.
if
let
Some
(
events
)
=
self
.backend
.dump_events
()
{
return
Ok
(
events
);
}
// Per-thread-state backends returned their events through the DumpEvents
// responses collected above.
Ok
(
all_events
)
}
...
...
lib/kv-router/src/lib.rs
View file @
ed4d8068
...
...
@@ -15,6 +15,7 @@ pub mod zmq_wire;
// Backward-compat re-exports: old top-level module paths still work
pub
use
indexer
::
concurrent_radix_tree
;
pub
use
indexer
::
concurrent_radix_tree_compressed
;
pub
use
indexer
::
positional
as
nested_map
;
pub
use
indexer
::
pruning
as
approx
;
pub
use
indexer
::
radix_tree
;
...
...
@@ -38,6 +39,7 @@ pub use self::multi_worker_sequence::{
};
pub
use
self
::
sequence
::{
ActiveSequences
,
RequestId
};
pub
use
concurrent_radix_tree
::
ConcurrentRadixTree
;
pub
use
concurrent_radix_tree_compressed
::
ConcurrentRadixTreeCompressed
;
pub
use
config
::{
KvRouterConfig
,
RouterConfigOverride
,
RouterQueuePolicy
};
pub
use
event_sink
::
EventSink
;
pub
use
indexer
::{
MaybeError
,
SyncIndexer
,
ThreadPoolIndexer
};
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment