chore(kv-router): remove sharded indexer path (#8041)

Signed-off-by: PeaBrane <yanrpei@gmail.com>

chore(kv-router): remove sharded indexer path (#8041)
Signed-off-by: PeaBrane <yanrpei@gmail.com>
02666f04 · Yan Ru Pei · GitHub · ba274a03 · 02666f04 · 02666f04
Unverified commit 02666f04, authored Apr 09, 2026 by Yan Ru Pei; committed by GitHub on Apr 09, 2026.
7 changed files
--- a/lib/bench/kv_router/kv_indexer_bench.rs
+++ b/lib/bench/kv_router/kv_indexer_bench.rs
 // SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
-//! Combined benchmark for KvIndexer, KvIndexerSharded, and PositionalIndexer (nested).
+//! Combined benchmark for KvIndexer, PositionalIndexer (nested), and ConcurrentRadixTree.
 //!
 //! Provides two modes:
 //! - `microbench`: Per-operation latency benchmarks comparing indexer implementations
 //! - `stress`: Queue saturation stress test under load
 //!
-//! Supported indexer types: single, sharded, nested, all
+//! Supported indexer types: single, nested, concurrent, all
 //!
 //! Run with:
 //!   cargo bench --package dynamo-bench --bench kv_indexer_bench -- microbench --help
@@ -21,9 +21,7 @@ use clap::{Args, Parser, Subcommand, ValueEnum};
 use dynamo_bench::common::LatencyStats;
 use dynamo_kv_router::{
    ConcurrentRadixTree,
-    indexer::{
-        KvIndexer, KvIndexerInterface, KvIndexerMetrics, KvIndexerSharded, ThreadPoolIndexer,
-    },
+    indexer::{KvIndexer, KvIndexerInterface, KvIndexerMetrics, ThreadPoolIndexer},
    nested_map::PositionalIndexer,
    protocols::{LocalBlockHash, RouterEvent},
 };
@@ -40,7 +38,7 @@ use tokio_util::sync::CancellationToken;
 #[derive(Parser)]
 #[command(name = "kv_indexer_bench")]
-#[command(about = "Combined benchmark for KvIndexer, KvIndexerSharded, and PositionalIndexer")]
+#[command(about = "Combined benchmark for KvIndexer, PositionalIndexer, and ConcurrentRadixTree")]
 struct Cli {
    #[command(subcommand)]
    command: Command,
@@ -63,8 +61,6 @@ enum Command {
 enum IndexerType {
    /// Non-sharded KvIndexer (single background thread)
    Single,
-    /// Sharded KvIndexer (multiple shards with separate trees)
-    Sharded,
    /// Nested PositionalIndexer (position-based HashMap with jump search)
    Nested,
    /// Concurrent radix tree (lock-per-node with DashMap lookup)
@@ -122,9 +118,9 @@ struct MicrobenchArgs {
    #[arg(long, value_enum, default_value = "all")]
    indexer_type: IndexerType,
-    /// Number of shards for sharded indexer
+    /// Number of event worker threads for nested/concurrent indexers
    #[arg(long, default_value = "4")]
-    num_shards: usize,
+    num_event_workers: usize,
    /// Jump size for nested/positional indexer
    #[arg(long, default_value = "32")]
@@ -164,9 +160,9 @@ struct StressArgs {
    #[arg(long, value_enum, default_value = "single")]
    indexer_type: IndexerType,
-    /// Number of shards for sharded indexer
+    /// Number of event worker threads for nested/concurrent indexers
    #[arg(long, default_value = "4")]
-    num_shards: usize,
+    num_event_workers: usize,
    /// Jump size for nested/positional indexer
    #[arg(long, default_value = "32")]
@@ -177,7 +173,7 @@ struct StressArgs {
 // Benchable Indexer Trait
 // ============================================================================
-/// Trait for abstracting over KvIndexer and KvIndexerSharded
+/// Trait for abstracting over benchmarked indexers
 #[async_trait::async_trait]
 trait BenchableIndexer: Send + Sync {
    async fn apply_event(&mut self, event: RouterEvent);
@@ -207,25 +203,6 @@ impl BenchableIndexer for KvIndexer {
    }
 }
-#[async_trait::async_trait]
-impl BenchableIndexer for KvIndexerSharded {
-    async fn apply_event(&mut self, event: RouterEvent) {
-        KvIndexerInterface::apply_event(self, event).await;
-    }
-    async fn find_matches(
-        &self,
-        sequence: Vec<LocalBlockHash>,
-    ) -> Result<(), dynamo_kv_router::indexer::KvRouterError> {
-        KvIndexerInterface::find_matches(self, sequence).await?;
-        Ok(())
-    }
-    fn name(&self) -> &str {
-        "KvIndexerSharded"
-    }
-}
 #[async_trait::async_trait]
 impl BenchableIndexer for ThreadPoolIndexer<PositionalIndexer> {
    async fn apply_event(&mut self, event: RouterEvent) {
@@ -697,6 +674,16 @@ async fn run_microbench_mode(args: MicrobenchArgs) {
        eprintln!("Error: size must be >= depth");
        std::process::exit(1);
    }
+    if matches!(
+        args.indexer_type,
+        IndexerType::Nested | IndexerType::Concurrent | IndexerType::All
+    ) && args.num_event_workers == 0
+    {
+        eprintln!(
+            "Error: num_event_workers must be > 0 when using Nested, Concurrent, or All indexer type"
+        );
+        std::process::exit(1);
+    }
    println!("KvIndexer Microbenchmark");
    println!("========================\n");
@@ -716,7 +703,7 @@ async fn run_microbench_mode(args: MicrobenchArgs) {
        args.prefix_prompt_ratio * 100.0
    );
    println!("  Prefix prompt groups: {}", args.num_prefix_prompts);
-    println!("  Num shards (for sharded): {}", args.num_shards);
+    println!("  Event worker threads: {}", args.num_event_workers);
    println!("  Indexer type: {:?}", args.indexer_type);
    println!("  Benchmark type: {}", args.benchmark_type);
    println!(
@@ -751,26 +738,11 @@ async fn run_microbench_mode(args: MicrobenchArgs) {
        tokio::time::sleep(Duration::from_millis(50)).await;
    }
-    // Benchmark sharded indexer
-    if matches!(args.indexer_type, IndexerType::Sharded | IndexerType::All) {
-        let token = CancellationToken::new();
-        let mut indexer = KvIndexerSharded::new(
-            token.clone(),
-            args.num_shards,
-            args.common.block_size,
-            metrics.clone(),
-        );
-        let result = run_microbenchmarks(&mut indexer, sequences, extra_sequences, &args).await;
-        results.push(result);
-        token.cancel();
-        tokio::time::sleep(Duration::from_millis(50)).await;
-    }
    // Benchmark nested indexer
    if matches!(args.indexer_type, IndexerType::Nested | IndexerType::All) {
        let mut indexer = ThreadPoolIndexer::new(
            PositionalIndexer::new(args.jump_size),
-            args.num_shards,
+            args.num_event_workers,
            args.common.block_size,
        );
        let result = run_microbenchmarks(&mut indexer, sequences, extra_sequences, &args).await;
@@ -786,7 +758,7 @@ async fn run_microbench_mode(args: MicrobenchArgs) {
    ) {
        let mut indexer = ThreadPoolIndexer::new(
            ConcurrentRadixTree::new(),
-            args.num_shards,
+            args.num_event_workers,
            args.common.block_size,
        );
        let result = run_microbenchmarks(&mut indexer, sequences, extra_sequences, &args).await;
@@ -1226,10 +1198,12 @@ async fn run_stress_mode(args: StressArgs) {
    }
    if matches!(
        args.indexer_type,
-        IndexerType::Sharded | IndexerType::Nested | IndexerType::All
-    ) && args.num_shards == 0
+        IndexerType::Nested | IndexerType::Concurrent | IndexerType::All
+    ) && args.num_event_workers == 0
    {
-        eprintln!("Error: num_shards must be > 0 when using Sharded, Nested, or All indexer type");
+        eprintln!(
+            "Error: num_event_workers must be > 0 when using Nested, Concurrent, or All indexer type"
+        );
        std::process::exit(1);
    }
@@ -1254,11 +1228,13 @@ async fn run_stress_mode(args: StressArgs) {
    println!("  Duration: {}s", args.duration);
    println!("  In-flight timeout: {}s", args.in_flight_timeout);
    println!("  Indexer type: {:?}", args.indexer_type);
-    if matches!(args.indexer_type, IndexerType::Sharded | IndexerType::All) {
-        println!("  Num shards (sharded): {}", args.num_shards);
+    if matches!(
+        args.indexer_type,
+        IndexerType::Nested | IndexerType::Concurrent | IndexerType::All
+    ) {
+        println!("  Event worker threads: {}", args.num_event_workers);
    }
    if matches!(args.indexer_type, IndexerType::Nested | IndexerType::All) {
-        println!("  Num workers (nested): {}", args.num_shards);
        println!("  Jump size (nested): {}", args.jump_size);
    }
@@ -1322,58 +1298,11 @@ async fn run_stress_mode(args: StressArgs) {
        tokio::time::sleep(Duration::from_millis(50)).await;
    }
-    // Test sharded indexer
-    if matches!(args.indexer_type, IndexerType::Sharded | IndexerType::All) {
-        let token = CancellationToken::new();
-        let indexer = KvIndexerSharded::new(
-            token.clone(),
-            args.num_shards,
-            args.common.block_size,
-            metrics.clone(),
-        );
-        println!(
-            "\n  Applying {} store events to KvIndexerSharded...",
-            sequences.len()
-        );
-        let construction_start = Instant::now();
-        for (event_id, seq) in sequences.iter().enumerate() {
-            let event = seq.to_store_event(event_id as u64);
-            KvIndexerInterface::apply_event(&indexer, event).await;
-            if args.common.verbose && (event_id + 1) % 100 == 0 {
-                println!("    Applied {}/{} events...", event_id + 1, sequences.len());
-            }
-        }
-        let construction_time = construction_start.elapsed();
-        let construction_events = sequences.len() as u64;
-        println!("  Tree construction completed in {:?}", construction_time);
-        println!(
-            "  Throughput: {:.0} events/sec",
-            construction_events as f64 / construction_time.as_secs_f64()
-        );
-        tokio::time::sleep(Duration::from_millis(100)).await;
-        let mut results = run_stress_test(Arc::new(indexer), &sequences, &args).await;
-        results.construction_time = construction_time;
-        results.construction_events = construction_events;
-        print_stress_results(&args, &results);
-        all_results.push(results);
-        token.cancel();
-        tokio::time::sleep(Duration::from_millis(50)).await;
-    }
    // Test nested indexer
    if matches!(args.indexer_type, IndexerType::Nested | IndexerType::All) {
        let indexer = ThreadPoolIndexer::new(
            PositionalIndexer::new(args.jump_size),
-            args.num_shards,
+            args.num_event_workers,
            args.common.block_size,
        );
@@ -1425,7 +1354,7 @@ async fn run_stress_mode(args: StressArgs) {
    ) {
        let indexer = ThreadPoolIndexer::new(
            ConcurrentRadixTree::new(),
-            args.num_shards,
+            args.num_event_workers,
            args.common.block_size,
        );

--- a/lib/bench/kv_router/mooncake_bench.rs
+++ b/lib/bench/kv_router/mooncake_bench.rs
@@ -7,9 +7,7 @@ use common::*;
 use clap::{Parser, Subcommand};
 use dynamo_kv_router::LocalBlockHash;
-use dynamo_kv_router::indexer::{
-    KvIndexer, KvIndexerInterface, KvIndexerMetrics, KvIndexerSharded,
-};
+use dynamo_kv_router::indexer::{KvIndexer, KvIndexerInterface, KvIndexerMetrics};
 use dynamo_kv_router::protocols::{KvCacheEvent, KvCacheEventData, RouterEvent};
 use dynamo_kv_router::{
    ConcurrentRadixTree, ConcurrentRadixTreeCompressed, PositionalIndexer, ThreadPoolIndexer,
@@ -26,13 +24,6 @@ enum IndexerArgs {
    /// Single-threaded radix tree indexer.
    RadixTree {},
-    /// Sharded radix tree indexer that partitions workers across independent shards.
-    RadixTreeSharded {
-        /// Number of independent shards to split workers across.
-        #[clap(long, default_value = "4")]
-        num_shards: usize,
-    },
    /// Position-based nested map indexer with jump search.
    NestedMap {
        /// Number of positions to skip during jump search before scanning back.
@@ -68,12 +59,6 @@ impl IndexerArgs {
            IndexerArgs::RadixTree {} => {
                Arc::new(KvIndexer::new(cancel_token, block_size, metrics))
            }
-            IndexerArgs::RadixTreeSharded { num_shards } => Arc::new(KvIndexerSharded::new(
-                cancel_token,
-                num_shards,
-                block_size,
-                metrics,
-            )),
            IndexerArgs::NestedMap {
                jump_size,
                num_event_workers,
@@ -115,7 +100,6 @@ impl IndexerArgs {
        let nw = num_event_workers;
        let indexer_args = match name {
            "radix-tree" => IndexerArgs::RadixTree {},
-            "radix-tree-sharded" => IndexerArgs::RadixTreeSharded { num_shards: 4 },
            "nested-map" => IndexerArgs::NestedMap {
                jump_size: 8,
                num_event_workers: nw,
@@ -127,7 +111,7 @@ impl IndexerArgs {
                num_event_workers: nw,
            },
            _ => anyhow::bail!(
-                "Unknown indexer '{}'. Valid names: radix-tree, radix-tree-sharded, \
+                "Unknown indexer '{}'. Valid names: radix-tree, \
                 nested-map, concurrent-radix-tree, concurrent-radix-tree-compressed",
                name
            ),
@@ -148,14 +132,15 @@ struct Args {
    /// Comma-separated list of indexer names to benchmark and compare on the
    /// same plot. Overrides the subcommand indexer when present. Valid names:
-    /// radix-tree, radix-tree-sharded, nested-map, concurrent-radix-tree,
+    /// radix-tree, nested-map, concurrent-radix-tree,
    /// concurrent-radix-tree-compressed.
    #[clap(long, value_delimiter = ',')]
    compare: Vec<String>,
    /// Number of OS threads for event processing in compare mode. Applies to
-    /// indexers that use a thread pool (nested-map, concurrent-radix-tree).
-    /// Ignored by radix-tree and radix-tree-sharded.
+    /// indexers that use a thread pool (nested-map, concurrent-radix-tree,
+    /// concurrent-radix-tree-compressed).
+    /// Ignored by radix-tree.
    #[clap(long, default_value = "16")]
    num_event_workers: usize,
@@ -555,7 +540,6 @@ async fn main() -> anyhow::Result<()> {
    let indexer_names: Vec<String> = if args.compare.is_empty() {
        let name = match args.get_indexer() {
            IndexerArgs::RadixTree {} => "radix-tree",
-            IndexerArgs::RadixTreeSharded { .. } => "radix-tree-sharded",
            IndexerArgs::NestedMap { .. } => "nested-map",
            IndexerArgs::ConcurrentRadixTree { .. } => "concurrent-radix-tree",
            IndexerArgs::ConcurrentRadixTreeCompressed { .. } => "concurrent-radix-tree-compressed",

--- a/lib/kv-router/src/indexer/README.md
+++ b/lib/kv-router/src/indexer/README.md
@@ -17,7 +17,6 @@ The concurrent indexers achieve a combined throughput of over **10 million event
 | `concurrent_radix_tree.rs` | `ConcurrentRadixTree` — thread-safe variant with `Arc<RwLock<Block>>` nodes and `DashMap` lookup |
 | `positional.rs` | `PositionalIndexer` — flat `DashMap<(pos, hash), SeqEntry>` with jump optimization |
 | `thread_pool.rs` | `ThreadPoolIndexer<T: SyncIndexer>` — N OS threads for sticky-routed writes, inline reads; wraps `ConcurrentRadixTree` or `PositionalIndexer` |
-| `sharded.rs` | `KvIndexerSharded` — N independent `RadixTree` shards each in its own OS thread, scatter-gather for matches |
 | `local.rs` | `LocalKvIndexer` — thin wrapper around `KvIndexer` with a circular event buffer for worker-side decentralized routing |
 | `pruning.rs` | `PruneManager` — TTL-based expiration and size-based pruning via `BinaryHeap<BlockEntry>` |
 | `naive.rs` | Brute-force baseline indexers (bench-only, behind `bench` feature flag) |

--- a/lib/kv-router/src/indexer/mod.rs
+++ b/lib/kv-router/src/indexer/mod.rs
@@ -34,7 +34,6 @@
 mod kv_indexer;
 mod local;
 mod metrics;
-mod sharded;
 mod thread_pool;
 mod traits;
 mod types;
@@ -52,7 +51,6 @@ mod tests;
 pub use kv_indexer::*;
 pub use local::*;
 pub use metrics::*;
-pub use sharded::*;
 pub use thread_pool::*;
 pub use traits::*;
 pub use types::*;

--- a/lib/kv-router/src/indexer/sharded.rs
+++ b/lib/kv-router/src/indexer/sharded.rs
--- a/lib/kv-router/src/indexer/tests.rs
+++ b/lib/kv-router/src/indexer/tests.rs
@@ -207,14 +207,14 @@ fn make_clear_event_with_dp_rank(worker_id: u64, dp_rank: u32) -> RouterEvent {
 #[template]
 #[rstest]
 fn indexer_template(
-    #[values("single", "sharded", "flat", "concurrent", "concurrent_compressed")] variant: &str,
+    #[values("single", "flat", "concurrent", "concurrent_compressed")] variant: &str,
 ) {
 }
 #[template]
 #[rstest]
 fn tree_size_indexer_template(
-    #[values("single", "sharded", "concurrent", "concurrent_compressed")] variant: &str,
+    #[values("single", "concurrent", "concurrent_compressed")] variant: &str,
 ) {
 }
@@ -225,7 +225,6 @@ fn make_indexer(variant: &str) -> Box<dyn KvIndexerInterface> {
    match variant {
        "single" => Box::new(KvIndexer::new(token, kv_block_size, metrics)),
-        "sharded" => Box::new(KvIndexerSharded::new(token, 4, kv_block_size, metrics)),
        "flat" => Box::new(ThreadPoolIndexer::new(
            PositionalIndexer::new(32),
            4,
@@ -330,7 +329,7 @@ mod interface_tests {
        // tree-size accounting gap after mid-chain removes because descendant
        // lookup entries are cleaned up lazily. That means "store -> partial
        // remove -> restore continuation" can still miscount restored coverage
-        // in single, sharded, and concurrent. This test is intentionally scoped
+        // in single and concurrent. This test is intentionally scoped
        // to duplicate store/remove replay so all tree-size variants share the
        // same stable baseline.
@@ -1854,13 +1853,13 @@ mod long_sequence_tests {
 }
 // ============================================================================
-// Tests specific to tree-based implementations (KvIndexer, KvIndexerSharded)
+// Tests specific to tree-based implementations with frequency/pruning support.
 // These use features not available in PositionalIndexer
 // ============================================================================
 #[template]
 #[rstest]
-fn tree_indexer_template(#[values("single", "sharded")] variant: &str) {}
+fn tree_indexer_template(#[values("single")] variant: &str) {}
 fn make_tree_indexer_with_frequency(
    variant: &str,
@@ -1878,25 +1877,16 @@ fn make_tree_indexer_with_frequency(
            metrics,
            None,
        )),
-        "sharded" => Box::new(KvIndexerSharded::new_with_frequency(
-            token,
-            4,
-            Some(expiration),
-            kv_block_size,
-            metrics,
-            None,
-        )),
        _ => panic!("Unknown variant: {}", variant),
    }
 }
 #[tokio::test]
-async fn test_sharded_routing_decision_assigns_first_seen_worker() {
+async fn test_routing_decision_assigns_first_seen_worker() {
    let token = CancellationToken::new();
    let metrics = Arc::new(KvIndexerMetrics::new_unregistered());
-    let index = KvIndexerSharded::new_with_frequency(
+    let index = KvIndexer::new_with_frequency(
        token,
-        4,
        Some(Duration::from_secs(60)),
        32,
        metrics,

--- a/lib/kv-router/src/indexer/types.rs
+++ b/lib/kv-router/src/indexer/types.rs
@@ -5,7 +5,7 @@
 use std::time::Instant;
 use serde::{Deserialize, Serialize};
-use tokio::sync::{mpsc, oneshot};
+use tokio::sync::oneshot;
 use crate::protocols::*;
 use dynamo_tokens::SequenceHash;
@@ -307,28 +307,3 @@ pub(super) struct RoutingDecisionRequest {
    pub(super) local_hashes: Vec<LocalBlockHash>,
    pub(super) sequence_hashes: Vec<SequenceHash>,
 }
-#[derive(Debug, Clone)]
-pub struct ShardedMatchRequest {
-    pub(super) sequence: Vec<LocalBlockHash>,
-    pub(super) early_exit: bool,
-    pub(super) resp: mpsc::Sender<OverlapScores>,
-    #[cfg(feature = "bench")]
-    pub(super) created_at: Instant,
-}
-impl ShardedMatchRequest {
-    pub(super) fn new(
-        sequence: Vec<LocalBlockHash>,
-        early_exit: bool,
-        resp: mpsc::Sender<OverlapScores>,
-    ) -> Self {
-        Self {
-            sequence,
-            early_exit,
-            resp,
-            #[cfg(feature = "bench")]
-            created_at: Instant::now(),
-        }
-    }
-}