chore: merge KvIndexer and ApproxKvIndexer (#4500)

Signed-off-by: PeaBrane <yanrpei@gmail.com>

chore: merge KvIndexer and ApproxKvIndexer (#4500)
Signed-off-by: PeaBrane <yanrpei@gmail.com>
c61e0dd3 · Yan Ru Pei · GitHub · 989e246e · c61e0dd3 · c61e0dd3
Unverified Commit c61e0dd3 authored Nov 21, 2025 by Yan Ru Pei Committed by GitHub Nov 21, 2025
14 changed files
--- a/components/src/dynamo/frontend/main.py
+++ b/components/src/dynamo/frontend/main.py
@@ -131,7 +131,25 @@ def parse_args():
        action="store_false",
        dest="use_kv_events",
        default=os.environ.get("DYN_KV_EVENTS", "true").lower() != "false",
-        help="KV Router: Disable KV events. When set, uses ApproxKvRouter for predicting block creation/deletion based only on incoming requests at a timer. By default, KV events are enabled.",
+        help="KV Router: Disable KV events. When set, the router predicts cache state based on routing decisions with TTL-based expiration and pruning, rather than receiving events from workers. By default, KV events are enabled.",
+    )
+    parser.add_argument(
+        "--router-ttl",
+        type=float,
+        default=float(os.environ.get("DYN_ROUTER_TTL", "120.0")),
+        help="KV Router: Time-to-live in seconds for blocks when KV events are disabled. Only used when --no-kv-events is set. Can be set via DYN_ROUTER_TTL env var (default: 120.0).",
+    )
+    parser.add_argument(
+        "--router-max-tree-size",
+        type=int,
+        default=int(os.environ.get("DYN_ROUTER_MAX_TREE_SIZE", str(2**10))),
+        help="KV Router: Maximum tree size before pruning when KV events are disabled. Only used when --no-kv-events is set. Can be set via DYN_ROUTER_MAX_TREE_SIZE env var (default: 1024).",
+    )
+    parser.add_argument(
+        "--router-prune-target-ratio",
+        type=float,
+        default=float(os.environ.get("DYN_ROUTER_PRUNE_TARGET_RATIO", "0.8")),
+        help="KV Router: Target size ratio after pruning when KV events are disabled. Only used when --no-kv-events is set. Can be set via DYN_ROUTER_PRUNE_TARGET_RATIO env var (default: 0.8).",
    )
    parser.add_argument(
        "--namespace",
@@ -282,6 +300,9 @@ async def async_main():
            router_snapshot_threshold=flags.router_snapshot_threshold,
            router_reset_states=flags.router_reset_states,
            router_track_active_blocks=flags.router_track_active_blocks,
+            router_ttl_secs=flags.router_ttl,
+            router_max_tree_size=flags.router_max_tree_size,
+            router_prune_target_ratio=flags.router_prune_target_ratio,
        )
    elif flags.router_mode == "random":
        router_mode = RouterMode.Random

--- a/components/src/dynamo/router/__main__.py
+++ b/components/src/dynamo/router/__main__.py
@@ -185,7 +185,7 @@ def parse_args():
        action="store_false",
        dest="use_kv_events",
        default=True,
-        help="KV Router: Disable KV events. When set, uses ApproxKvRouter for predicting block creation/deletion based only on incoming requests. By default, KV events are enabled.",
+        help="KV Router: Disable KV events. When set, the router predicts cache state based on routing decisions with TTL-based expiration and pruning, rather than receiving events from workers. By default, KV events are enabled.",
    )

    parser.add_argument(
@@ -218,6 +218,27 @@ def parse_args():
        help="KV Router: Disable tracking of active blocks (blocks being used for ongoing generation). By default, active blocks are tracked for load balancing (default: True)",
    )

+    parser.add_argument(
+        "--router-ttl-secs",
+        type=float,
+        default=120.0,
+        help="KV Router: TTL for blocks in seconds. Only used when --no-kv-events is set. Controls how long cached blocks are considered valid without explicit events (default: 120.0)",
+    )
+
+    parser.add_argument(
+        "--router-max-tree-size",
+        type=int,
+        default=2**10,
+        help="KV Router: Maximum tree size before pruning. Only used when --no-kv-events is set. When the indexer tree exceeds this size, pruning is triggered (default: 1024)",
+    )
+
+    parser.add_argument(
+        "--router-prune-target-ratio",
+        type=float,
+        default=0.8,
+        help="KV Router: Target size ratio after pruning (0.0-1.0). Only used when --no-kv-events is set. Determines how aggressively to prune the tree (default: 0.8)",
+    )
+
    return parser.parse_args()


@@ -244,7 +265,10 @@ async def worker(runtime: DistributedRuntime):
        f"use_kv_events={args.use_kv_events}, "
        f"router_replica_sync={args.router_replica_sync}, "
        f"router_reset_states={args.router_reset_states}, "
-        f"router_track_active_blocks={args.router_track_active_blocks}"
+        f"router_track_active_blocks={args.router_track_active_blocks}, "
+        f"router_ttl_secs={args.router_ttl_secs}, "
+        f"router_max_tree_size={args.router_max_tree_size}, "
+        f"router_prune_target_ratio={args.router_prune_target_ratio}"
    )

    # Create KvRouter configuration
@@ -256,6 +280,9 @@ async def worker(runtime: DistributedRuntime):
        router_snapshot_threshold=args.router_snapshot_threshold,
        router_reset_states=args.router_reset_states,
        router_track_active_blocks=args.router_track_active_blocks,
+        router_ttl_secs=args.router_ttl_secs,
+        router_max_tree_size=args.router_max_tree_size,
+        router_prune_target_ratio=args.router_prune_target_ratio,
    )

    # Create service component - use "router" as component name

--- a/docs/reference/cli.md
+++ b/docs/reference/cli.md
@@ -148,7 +148,7 @@ The KV-aware routing arguments:

 - `--router-temperature`: Sets the temperature when randomly selecting workers to route to via softmax sampling on the router cost logits. Setting it to 0 recovers the deterministic behavior where the min logit is picked.

- `--use-kv-events`: Sets whether to listen to KV events for maintaining the global view of cached blocks. If true, then we use the `KvIndexer` to listen to the block creation and deletion events. If false, `ApproxKvIndexer`, which assumes the kv cache of historical prompts exists for fixed time durations (hard-coded to 120s), is used to predict the kv cache hit ratio in each engine. Set false if your backend engine does not emit KV events.
+- `--use-kv-events`: Sets whether to listen to KV events for maintaining the global view of cached blocks. If true, the router uses KV events to track block creation and deletion from workers. If false, the router predicts cache state based on routing decisions with TTL-based expiration (default 120s) and pruning. Set false if your backend engine does not emit KV events.

 ### Request Migration


--- a/docs/router/README.md
+++ b/docs/router/README.md
@@ -210,8 +210,8 @@ The router uses KV events from workers by default to maintain an accurate global
  - Recommended for production deployments

 - **Without KV Events (--no-kv-events)**:
-  - Uses ApproxKvIndexer to estimate cached blocks from routing decisions
-  - Assumes blocks from recent requests remain cached
+  - Router predicts cache state based on routing decisions with TTL-based expiration and pruning
+  - Tracks blocks from recent requests with configurable time-to-live
  - Reduces overhead at the cost of routing accuracy
  - Suitable for testing or when event processing becomes a bottleneck


--- a/docs/router/kv_cache_routing.md
+++ b/docs/router/kv_cache_routing.md
@@ -21,7 +21,7 @@ The main KV-aware routing arguments:

 - `--router-temperature`: Controls worker selection randomness through softmax sampling of router cost logits. A value of 0 (default) ensures deterministic selection of the lowest-cost worker, while higher values introduce more randomness.

- `--no-kv-events`: Disables KV event tracking. By default (when this flag is not provided), the router uses `KvIndexer` to monitor block creation and deletion events. When disabled with this flag, uses `ApproxKvIndexer`, which estimates cache hits based on a fixed time window (120s). Use this flag if your backend doesn't support KV events (or you are not confident in the accuracy or responsiveness of the events).
+- `--no-kv-events`: Disables KV event tracking. By default (when this flag is not provided), the router uses KV events to monitor block creation and deletion from workers. When disabled with this flag, the router predicts cache state based on routing decisions with TTL-based expiration (default 120s) and pruning. Use this flag if your backend doesn't support KV events (or you are not confident in the accuracy or responsiveness of the events).

 - `--router-replica-sync`:  Disabled by default. Enables NATS-based synchronization of local routing decisions between router replicas. When enabled, routers share their active sequence information and local predictions of block usage, improving routing consistency across instances. Note that this does not sync the radix tree or cached KV block states themselves - those are synchronized through JetStream events

@@ -33,10 +33,18 @@ The main KV-aware routing arguments:

 - `--busy-threshold`: Threshold (0.0-1.0) for determining when a worker is considered busy based on KV cache usage. When a worker's KV cache active blocks exceed this percentage of total blocks, it will be marked as busy and excluded from routing. If not set, busy detection is disabled. This feature works with all routing modes (`--router-mode kv|round-robin|random`) as long as backend engines emit `ForwardPassMetrics`.

+- `--router-ttl`: Time-to-live in seconds for blocks in the router's local cache predictions (the standalone router exposes the same setting as `--router-ttl-secs`). Blocks older than this duration are automatically expired and removed from the router's radix tree. Defaults to 120.0 seconds and only takes effect when `--no-kv-events` is set. This helps manage memory usage by removing stale cache predictions that are unlikely to be accurate.
+
+- `--router-max-tree-size`: Maximum tree size (number of blocks) before pruning is triggered. When the total number of blocks in the radix tree exceeds this threshold, the router will prune the least recently used blocks. Defaults to 1024 (2^10 blocks) when `--no-kv-events` is used. This prevents unbounded memory growth in long-running deployments.
+
+- `--router-prune-target-ratio`: Target size ratio to prune down to when `--router-max-tree-size` is exceeded. For example, with a value of 0.8 (default) and max tree size of 1024, the router will prune down to approximately 819 blocks when the threshold is exceeded. Defaults to 0.8 when `--no-kv-events` is used. This creates headroom before the next pruning cycle.
+
 >[!Note]
-> State persistence is only available when KV events are enabled (default). When using `--no-kv-events` with `ApproxKvIndexer`, state persistence is not currently supported.
+> State persistence is only available when KV events are enabled (default). When using `--no-kv-events`, state persistence is not currently supported.
 >
 > When `--kv-overlap-score-weight` is set to 0 or `--no-kv-events` is set, no KvIndexer will be launched to drain and process KV events. It's recommended to disable your backend workers from relaying events through `KvEventPublisher` to avoid event accumulation in JetStream. WIP to enable disabling publishing of KV events completely in these cases.
+>
+> The CLI arguments `--router-ttl`, `--router-max-tree-size`, and `--router-prune-target-ratio` control local cache management when the router operates without receiving events from workers. When KV events are enabled (default), the router relies on worker-side eviction events and these parameters are ignored.

 ## Prerequisites and Limitations


--- a/launch/dynamo-run/src/flags.rs
+++ b/launch/dynamo-run/src/flags.rs
@@ -69,8 +69,8 @@ pub struct Flags {
    pub router_temperature: Option<f64>,

    /// KV Router: Whether to use KV events to maintain the view of cached blocks
-    /// If false, would use ApproxKvRouter for predicting block creation / deletion
-    /// based only on incoming requests at a timer.
+    /// If false, the router predicts cache state based on routing decisions
+    /// with TTL-based expiration and pruning, rather than receiving events from workers.
    /// Default: true
    #[arg(long)]
    pub use_kv_events: Option<bool>,
@@ -189,6 +189,9 @@ impl Flags {
                // defaulting below args (no longer maintaining new flags for dynamo-run)
                None,
                None,
+                None,
+                None,
+                None,
            ),
        )
    }

--- a/lib/bindings/c/src/lib.rs
+++ b/lib/bindings/c/src/lib.rs
@@ -452,9 +452,12 @@ pub unsafe extern "C" fn dynamo_create_worker_selection_pipeline(
                (router_temperature >= 0.0).then_some(router_temperature),
                Some(use_kv_events),
                Some(router_replica_sync),
-                None,
-                None,
-                None,
+                None, // track_active_blocks
+                None, // router_snapshot_threshold
+                None, // router_reset_states
+                None, // router_ttl_secs
+                None, // router_max_tree_size
+                None, // router_prune_target_ratio
            ))
        } else {
            None

--- a/lib/bindings/python/rust/llm/entrypoint.rs
+++ b/lib/bindings/python/rust/llm/entrypoint.rs
@@ -41,7 +41,8 @@ impl KvRouterConfig {
 #[pymethods]
 impl KvRouterConfig {
    #[new]
-    #[pyo3(signature = (overlap_score_weight=1.0, router_temperature=0.0, use_kv_events=true, router_replica_sync=false, router_track_active_blocks=true, router_snapshot_threshold=1000000, router_reset_states=false))]
+    #[pyo3(signature = (overlap_score_weight=1.0, router_temperature=0.0, use_kv_events=true, router_replica_sync=false, router_track_active_blocks=true, router_snapshot_threshold=1000000, router_reset_states=false, router_ttl_secs=120.0, router_max_tree_size=1024, router_prune_target_ratio=0.8))]
+    #[allow(clippy::too_many_arguments)]
    fn new(
        overlap_score_weight: f64,
        router_temperature: f64,
@@ -50,6 +51,9 @@ impl KvRouterConfig {
        router_track_active_blocks: bool,
        router_snapshot_threshold: Option<u32>,
        router_reset_states: bool,
+        router_ttl_secs: f64,
+        router_max_tree_size: usize,
+        router_prune_target_ratio: f64,
    ) -> Self {
        KvRouterConfig {
            inner: RsKvRouterConfig {
@@ -60,6 +64,9 @@ impl KvRouterConfig {
                router_track_active_blocks,
                router_snapshot_threshold,
                router_reset_states,
+                router_ttl_secs,
+                router_max_tree_size,
+                router_prune_target_ratio,
            },
        }
    }

--- a/lib/bindings/python/rust/llm/kv.rs
+++ b/lib/bindings/python/rust/llm/kv.rs
@@ -723,33 +723,57 @@ impl KvIndexer {
    }
 }

-/// Bindings for the approximate KV indexer. We need to exactly match the regular KV Indexer
-/// interface, so that the router can switch between the two.
+/// Bindings for the approximate KV indexer. This is a wrapper around KvIndexer
+/// that uses TTL-based expiration and pruning instead of receiving KV events from workers.
 #[pyclass]
 pub(crate) struct ApproxKvIndexer {
-    inner: Arc<llm_rs::kv_router::approx::ApproxKvIndexer>,
+    inner: Arc<llm_rs::kv_router::indexer::KvIndexer>,
 }

 #[pymethods]
 impl ApproxKvIndexer {
    #[new]
-    fn new(component: Component, kv_block_size: usize, ttl_secs: f64) -> PyResult<Self> {
-        let ttl = tokio::time::Duration::from_secs_f64(ttl_secs);
-        let prune_config = Some(llm_rs::kv_router::approx::PruneConfig {
-            max_tree_size: 2usize.pow(20), // 2 ** 20 = 1048576
-            prune_target_ratio: 0.8,
-        });
-        let inner = Arc::new(llm_rs::kv_router::approx::ApproxKvIndexer::new(
-            component.inner.drt().runtime().child_token(),
+    #[pyo3(signature = (component, kv_block_size, router_ttl_secs=120.0, router_max_tree_size=1024, router_prune_target_ratio=0.8))]
+    fn new(
+        component: Component,
+        kv_block_size: usize,
+        router_ttl_secs: f64,
+        router_max_tree_size: usize,
+        router_prune_target_ratio: f64,
+    ) -> PyResult<Self> {
+        let runtime = pyo3_async_runtimes::tokio::get_runtime();
+        runtime.block_on(async {
+            let cancellation_token = component.inner.drt().runtime().child_token();
+            let kv_indexer_metrics =
+                llm_rs::kv_router::indexer::KvIndexerMetrics::from_component(&component.inner);
+
+            // Build PruneConfig with the provided parameters
+            let prune_config = llm_rs::kv_router::approx::PruneConfig {
+                ttl: std::time::Duration::from_secs_f64(router_ttl_secs),
+                max_tree_size: router_max_tree_size,
+                prune_target_ratio: router_prune_target_ratio,
+            };
+
+            // Create KvIndexer with pruning enabled, but DO NOT subscribe to events
+            let inner: Arc<llm_rs::kv_router::indexer::KvIndexer> =
+                llm_rs::kv_router::indexer::KvIndexer::new_with_frequency(
+                    cancellation_token.clone(),
+                    None, // expiration_duration - not used with prune_config
                    kv_block_size as u32,
-            ttl,
-            prune_config,
-        ));
+                    kv_indexer_metrics,
+                    Some(prune_config),
+                )
+                .into();
+
+            // Note: We deliberately do NOT call start_kv_router_background here
+            // because ApproxKvIndexer doesn't use KV events from workers
+
            Ok(Self { inner })
+        })
    }

-    fn block_size(&self) -> u32 {
-        self.inner.block_size()
+    fn block_size(&self) -> usize {
+        self.inner.block_size() as usize
    }

    fn find_matches_for_request<'p>(

--- a/lib/bindings/python/src/dynamo/_core.pyi
+++ b/lib/bindings/python/src/dynamo/_core.pyi
@@ -613,33 +613,78 @@ class KvIndexer:

 class ApproxKvIndexer:
    """
-    A KV Indexer that doesn't use KV cache events. It instead relies solely on the input tokens.
+    An approximate KV Indexer that doesn't receive KV cache events from workers.
+    Instead, it relies on routing decisions with TTL-based expiration and pruning
+    to estimate which blocks are cached on which workers.
+
+    This is useful when:
+    - Backend engines don't emit KV events
+    - You want to reduce event processing overhead
+    - Lower routing accuracy is acceptable
    """

-    def __init__(self, component: Component, kv_block_size: int, ttl_secs: float) -> None:
+    ...
+
+    def __init__(
+        self,
+        component: Component,
+        kv_block_size: int,
+        router_ttl_secs: float = 120.0,
+        router_max_tree_size: int = 1024,
+        router_prune_target_ratio: float = 0.8,
+    ) -> None:
        """
-        Create a `ApproxKvIndexer` object
+        Create an `ApproxKvIndexer` object
+
+        Args:
+            component: The component to associate with this indexer
+            kv_block_size: The KV cache block size
+            router_ttl_secs: TTL for blocks in seconds (default: 120.0)
+            router_max_tree_size: Maximum tree size before pruning (default: 1024)
+            router_prune_target_ratio: Target size ratio after pruning (default: 0.8)
        """
        ...

-    def find_matches_for_request(self, token_ids: List[int], lora_id: int) -> OverlapScores:
+    def find_matches_for_request(
+        self, token_ids: List[int]
+    ) -> OverlapScores:
        """
        Return the overlapping scores of workers for the given token ids.
+
+        Args:
+            token_ids: List of token IDs to find matches for
+
+        Returns:
+            OverlapScores containing worker matching scores and frequencies
        """
        ...

    def block_size(self) -> int:
        """
        Return the block size of the ApproxKvIndexer.
+
+        Returns:
+            The KV cache block size
        """
        ...

-    def process_routing_decision_for_request(self, tokens: List[int], lora_id: int, worker_id: int) -> None:
+    async def process_routing_decision_for_request(
+        self, tokens: List[int], worker_id: int, dp_rank: int = 0
+    ) -> None:
        """
-        Notify the indexer that a token sequence has been sent to a specific worker.
+        Notify the indexer that a token sequence has been routed to a specific worker.
+
+        This updates the indexer's internal state to track which blocks are likely
+        cached on which workers based on routing decisions.
+
+        Args:
+            tokens: List of token IDs that were routed
+            worker_id: The worker ID the request was routed to
+            dp_rank: The data parallel rank (default: 0)
        """
        ...

+
 class KvRecorder:
    """
    A recorder for KV Router events.
@@ -978,6 +1023,35 @@ class RouterConfig:

 class KvRouterConfig:
    """Values for KV router"""
+
+    def __init__(
+        self,
+        overlap_score_weight: float = 1.0,
+        router_temperature: float = 0.0,
+        use_kv_events: bool = True,
+        router_replica_sync: bool = False,
+        router_track_active_blocks: bool = True,
+        router_snapshot_threshold: Optional[int] = 1000000,
+        router_reset_states: bool = False,
+        router_ttl_secs: float = 120.0,
+        router_max_tree_size: int = 1024,
+        router_prune_target_ratio: float = 0.8,
+    ) -> None:
+        """
+        Create a KV router configuration.
+
+        Args:
+            overlap_score_weight: Weight for overlap score in worker selection (default: 1.0)
+            router_temperature: Temperature for worker sampling via softmax (default: 0.0)
+            use_kv_events: Whether to use KV events from workers (default: True)
+            router_replica_sync: Enable replica synchronization (default: False)
+            router_track_active_blocks: Track active blocks for load balancing (default: True)
+            router_snapshot_threshold: Number of messages before snapshot (default: 1000000)
+            router_reset_states: Reset router state on startup (default: False)
+            router_ttl_secs: TTL for blocks in seconds when not using KV events (default: 120.0)
+            router_max_tree_size: Maximum tree size before pruning (default: 1024)
+            router_prune_target_ratio: Target size ratio after pruning (default: 0.8)
+        """
        ...

 async def register_llm(

--- a/lib/bindings/python/tests/test_kv_bindings.py
+++ b/lib/bindings/python/tests/test_kv_bindings.py
@@ -248,27 +248,32 @@ async def test_event_handler(distributed_runtime):

 @pytest.mark.asyncio
 async def test_approx_kv_indexer(distributed_runtime):
+    """Test ApproxKvIndexer with TTL-based block tracking"""
    kv_block_size = 32
    namespace = "kv_test"
    component = "approx_kv"
    kv_listener = distributed_runtime.namespace(namespace).component(component)

-    indexer = ApproxKvIndexer(kv_listener, kv_block_size, 30.0)
+    # Create ApproxKvIndexer with default TTL (120s)
+    indexer = ApproxKvIndexer(kv_listener, kv_block_size)

    tokens = [0] * (kv_block_size * 2)

+    # Initially no matches
    scores = await indexer.find_matches_for_request(tokens)
    assert not scores.scores

    worker_id = 0

+    # Process routing decision - this should add blocks to the indexer
    await indexer.process_routing_decision_for_request(tokens, worker_id)

+    # Now we should have matches
    scores = await indexer.find_matches_for_request(tokens)
    assert scores.scores
    worker_key = (worker_id, 0)  # (worker_id, dp_rank)
    assert worker_key in scores.scores
-    assert scores.scores[worker_key] == 2
+    assert scores.scores[worker_key] == 2  # 2 blocks (tokens is 2 blocks long)


 class EventPublisher:

--- a/lib/llm/src/kv_router.rs
+++ b/lib/llm/src/kv_router.rs
@@ -36,7 +36,6 @@ pub use prefill_router::PrefillRouter;

 use crate::{
    kv_router::{
-        approx::ApproxKvIndexer,
        approx::PruneConfig,
        indexer::{
            KvIndexer, KvIndexerInterface, KvRouterError, OverlapScores, RouterEvent,
@@ -53,6 +52,7 @@ use crate::{
    model_card::ModelDeploymentCard,
    preprocessor::PreprocessedRequest,
    protocols::common::llm_backend::LLMEngineOutput,
+    tokens::SequenceHash,
 };

 // [gluo TODO] shouldn't need to be public
@@ -113,6 +113,15 @@ pub struct KvRouterConfig {

    /// Whether to reset the router state on startup (default: false)
    pub router_reset_states: bool,
+
+    /// TTL for blocks in seconds (only used when use_kv_events is false, default: 120.0)
+    pub router_ttl_secs: f64,
+
+    /// Maximum tree size before pruning (only used when use_kv_events is false, default: 1024)
+    pub router_max_tree_size: usize,
+
+    /// Target size ratio after pruning (only used when use_kv_events is false, default: 0.8)
+    pub router_prune_target_ratio: f64,
 }

 impl Default for KvRouterConfig {
@@ -125,6 +134,9 @@ impl Default for KvRouterConfig {
            router_track_active_blocks: true,
            router_snapshot_threshold: Some(1000000),
            router_reset_states: false,
+            router_ttl_secs: 120.0,
+            router_max_tree_size: 1024,
+            router_prune_target_ratio: 0.8,
        }
    }
 }
@@ -141,6 +153,9 @@ impl KvRouterConfig {
        track_active_blocks: Option<bool>,
        router_snapshot_threshold: Option<Option<u32>>,
        router_reset_states: Option<bool>,
+        router_ttl_secs: Option<f64>,
+        router_max_tree_size: Option<usize>,
+        router_prune_target_ratio: Option<f64>,
    ) -> Self {
        let default = Self::default();
        Self {
@@ -153,21 +168,20 @@ impl KvRouterConfig {
            router_snapshot_threshold: router_snapshot_threshold
                .unwrap_or(default.router_snapshot_threshold),
            router_reset_states: router_reset_states.unwrap_or(default.router_reset_states),
+            router_ttl_secs: router_ttl_secs.unwrap_or(default.router_ttl_secs),
+            router_max_tree_size: router_max_tree_size.unwrap_or(default.router_max_tree_size),
+            router_prune_target_ratio: router_prune_target_ratio
+                .unwrap_or(default.router_prune_target_ratio),
        }
    }
 }

-// TODO: is there a way (macro) to auto-derive the KvIndexerInterface trait for this
-// since both variants implement it
 pub enum Indexer {
-    /// Updates itself based on KV events emitted by backend workers.
+    /// Updates itself based on KV events emitted by backend workers or routing decisions.
+    /// Supports TTL-based expiration and size-based pruning.
    /// Has the ability to persist and snapshot states.
    KvIndexer(KvIndexer),

-    /// Predicts the cached blocks based on requests on a TTL basis.
-    /// Currently does not persist or snapshot states (WIP to enable that).
-    ApproxKvIndexer(ApproxKvIndexer),
-
    /// Used when we do not wish to use the indexer at all (e.g., when overlap_score_weight is 0).
    /// Note: This will cause KV events to accumulate in JetStream as we do not regularly purge them.
    None,
@@ -180,7 +194,6 @@ impl Indexer {
    ) -> Result<OverlapScores, KvRouterError> {
        match self {
            Indexer::KvIndexer(indexer) => indexer.find_matches(sequence).await,
-            Indexer::ApproxKvIndexer(indexer) => indexer.find_matches(sequence).await,
            Indexer::None => Ok(OverlapScores {
                scores: HashMap::new(),
                frequencies: Vec::new(),
@@ -192,7 +205,6 @@ impl Indexer {
    async fn dump_events(&self) -> Result<Vec<RouterEvent>, KvRouterError> {
        match self {
            Indexer::KvIndexer(indexer) => indexer.dump_events().await,
-            Indexer::ApproxKvIndexer(indexer) => indexer.dump_events().await,
            Indexer::None => {
                panic!(
                    "Cannot dump events: indexer does not exist (is overlap_score_weight set to 0?)"
@@ -200,6 +212,22 @@ impl Indexer {
            }
        }
    }
+
+    async fn process_routing_decision(
+        &self,
+        worker: WorkerWithDpRank,
+        local_hashes: Vec<LocalBlockHash>,
+        sequence_hashes: Vec<SequenceHash>,
+    ) -> Result<(), KvRouterError> {
+        match self {
+            Indexer::KvIndexer(indexer) => {
+                indexer
+                    .process_routing_decision(worker, local_hashes, sequence_hashes)
+                    .await
+            }
+            Indexer::None => Ok(()),
+        }
+    }
 }

 /// A KvRouter only decides which worker you should use. It doesn't send you there.
@@ -253,23 +281,26 @@ impl KvRouter {
        let indexer = if kv_router_config.overlap_score_weight == 0.0 {
            // When overlap_score_weight is zero, we don't need to track prefixes
            Indexer::None
-        } else if kv_router_config.use_kv_events {
+        } else {
            let kv_indexer_metrics = indexer::KvIndexerMetrics::from_component(component);
-            Indexer::KvIndexer(KvIndexer::new(
-                cancellation_token.clone(),
-                block_size,
-                kv_indexer_metrics,
-            ))
+
+            // If use_kv_events is false, enable TTL and pruning for approximate behavior
+            let prune_config = if !kv_router_config.use_kv_events {
+                Some(PruneConfig {
+                    ttl: Duration::from_secs_f64(kv_router_config.router_ttl_secs),
+                    max_tree_size: kv_router_config.router_max_tree_size,
+                    prune_target_ratio: kv_router_config.router_prune_target_ratio,
+                })
            } else {
-            // hard code 120 seconds for now
-            Indexer::ApproxKvIndexer(ApproxKvIndexer::new(
+                None
+            };
+
+            Indexer::KvIndexer(KvIndexer::new_with_frequency(
                cancellation_token.clone(),
+                None, // expiration_duration for frequency tracking
                block_size,
-                Duration::from_secs(120),
-                Some(PruneConfig {
-                    max_tree_size: 2usize.pow(20), // 2 ** 20 = 1048576
-                    prune_target_ratio: 0.8,
-                }),
+                kv_indexer_metrics,
+                prune_config,
            ))
        };

@@ -284,8 +315,10 @@ impl KvRouter {
        )
        .await?;

-        // Start unified background process if using KvIndexer
-        if let Indexer::KvIndexer(ref kv_indexer) = indexer {
+        // Start KV event subscriber background process (only when use_kv_events is enabled)
+        if kv_router_config.use_kv_events
+            && let Indexer::KvIndexer(ref kv_indexer) = indexer
+        {
            start_kv_router_background(
                component.clone(),
                consumer_uuid,
@@ -343,12 +376,12 @@ impl KvRouter {
        let overlap_scores = self.indexer.find_matches(block_hashes.clone()).await?;

        // Determine who needs seq_hashes
-        let approx_indexer_needs_it = matches!(self.indexer, Indexer::ApproxKvIndexer(_));
+        let needs_process_routing = !self.kv_router_config.use_kv_events;
        let scheduler_needs_it = self.kv_router_config.router_track_active_blocks;

        // Optimize cloning: only clone if both need it, otherwise move
        let (maybe_seq_hashes_1, maybe_seq_hashes_2) =
-            match (approx_indexer_needs_it, scheduler_needs_it) {
+            match (needs_process_routing, scheduler_needs_it) {
                (true, true) => (Some(seq_hashes.clone()), Some(seq_hashes)),
                (true, false) => (Some(seq_hashes), None),
                (false, true) => (None, Some(seq_hashes)),
@@ -367,12 +400,12 @@ impl KvRouter {
            )
            .await?;

-        if let Indexer::ApproxKvIndexer(ref indexer) = self.indexer {
-            indexer
+        // Process routing decision when not using KV events (approximate mode with TTL/pruning)
+        if needs_process_routing {
+            self.indexer
                .process_routing_decision(best_worker, block_hashes, maybe_seq_hashes_1.unwrap())
-                .await
-                .unwrap();
-        };
+                .await?;
+        }

        let overlap_amount = overlap_scores
            .scores

--- a/lib/llm/src/kv_router/approx.rs
+++ b/lib/llm/src/kv_router/approx.rs
--- a/lib/llm/src/kv_router/indexer.rs
+++ b/lib/llm/src/kv_router/indexer.rs