feat: (Router) Guard threshold-based rejection logic when CLI thresholds are unset (#8333)

adfc02d5 · Karen Chung · GitHub · 8fd7de9a · adfc02d5 · adfc02d5
Unverified commit adfc02d5, authored Apr 20, 2026 by Karen Chung; committed by GitHub on Apr 20, 2026.
8 changed files
--- a/components/src/dynamo/frontend/frontend_args.py
+++ b/components/src/dynamo/frontend/frontend_args.py
@@ -45,6 +45,20 @@ def validate_model_path(value: str) -> str:
    return value


+def _nullable_float(value: str) -> Optional[float]:
+    """Parse a float, or return None for the literal 'None'."""
+    if value is None or value == "None":  # tolerates a real None too; the string match is exact and case-sensitive
+        return None
+    return float(value)  # any other text, e.g. "none" or "", raises ValueError here
+
+
+def _nullable_int(value: str) -> Optional[int]:
+    """Parse an int, or return None for the literal 'None'."""
+    if value is None or value == "None":  # tolerates a real None too; the string match is exact and case-sensitive
+        return None
+    return int(value)  # base-10 integers only: "1e7" or "10.5" raises ValueError here
+
+
 class FrontendConfig(KvRouterConfigBase, AicPerfConfigBase):
    """Configuration for the Dynamo frontend."""

@@ -331,36 +345,37 @@ class FrontendArgGroup(ArgGroup):
            g,
            flag_name="--active-decode-blocks-threshold",
            env_var="DYN_ACTIVE_DECODE_BLOCKS_THRESHOLD",
-            default=None,
+            default=1.0,
            help=(
-                "Threshold percentage (0.0-1.0) for determining when a worker is considered busy "
-                "based on KV cache block utilization. If not set, blocks-based busy detection is disabled."
+                "Threshold fraction (0.0-1.0) of KV cache block utilization above which a worker "
+                "is considered busy. Pass 'None' on the CLI to disable this check. Default: 1.0."
            ),
-            arg_type=float,
+            arg_type=_nullable_float,
        )
        add_argument(
            g,
            flag_name="--active-prefill-tokens-threshold",
            env_var="DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD",
-            default=None,
+            default=10_000_000,
            help=(
                "Literal token count threshold for determining when a worker is considered busy "
                "based on prefill token utilization. When active prefill tokens exceed this "
-                "threshold, the worker is marked as busy. If not set, tokens-based busy detection is disabled."
+                "threshold, the worker is marked as busy. Pass 'None' on the CLI to disable this "
+                "check. Uses OR logic with --active-prefill-tokens-threshold-frac. Default: 10000000."
            ),
-            arg_type=int,
+            arg_type=_nullable_int,
        )
        add_argument(
            g,
            flag_name="--active-prefill-tokens-threshold-frac",
            env_var="DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD_FRAC",
-            default=None,
+            default=10.0,
            help=(
                "Fraction of max_num_batched_tokens for busy detection. Worker is busy when "
-                "active_prefill_tokens > frac * max_num_batched_tokens. Default 1.5 (disabled). "
-                "Uses OR logic with --active-prefill-tokens-threshold."
+                "active_prefill_tokens > frac * max_num_batched_tokens. Pass 'None' on the CLI to "
+                "disable this check. Uses OR logic with --active-prefill-tokens-threshold. Default: 10.0."
            ),
-            arg_type=float,
+            arg_type=_nullable_float,
        )
        add_argument(
            g,

--- a/docs/components/frontend/configuration.md
+++ b/docs/components/frontend/configuration.md
@@ -68,9 +68,9 @@ When enabled, the frontend's embedded KV router predicts one expected prefill du
 | CLI Argument | Env Var | Default | Description |
 |-------------|---------|---------|-------------|
 | `--migration-limit` | `DYN_MIGRATION_LIMIT` | `0` | Max request migrations per worker disconnect. 0 = disabled |
-| `--active-decode-blocks-threshold` | `DYN_ACTIVE_DECODE_BLOCKS_THRESHOLD` | — | KV cache utilization fraction (0.0–1.0) for busy detection |
-| `--active-prefill-tokens-threshold` | `DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD` | — | Absolute token count for prefill busy detection |
-| `--active-prefill-tokens-threshold-frac` | `DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD_FRAC` | — | Fraction of `max_num_batched_tokens` for prefill busy detection. OR logic with absolute threshold |
+| `--active-decode-blocks-threshold` | `DYN_ACTIVE_DECODE_BLOCKS_THRESHOLD` | `1.0` | KV cache utilization fraction (0.0–1.0) for busy detection. Pass `None` to disable |
+| `--active-prefill-tokens-threshold` | `DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD` | `10000000` | Absolute token count for prefill busy detection. Pass `None` to disable |
+| `--active-prefill-tokens-threshold-frac` | `DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD_FRAC` | `10.0` | Fraction of `max_num_batched_tokens` for prefill busy detection. OR logic with absolute threshold. Pass `None` to disable |

 ## Model Discovery


--- a/docs/fault-tolerance/README.md
+++ b/docs/fault-tolerance/README.md
@@ -78,8 +78,9 @@ See [Health Checks](../observability/health-checks.md) for details.
 | Canary health checks | `DYN_HEALTH_CHECK_ENABLED` | `false` |
 | Canary wait time | `DYN_CANARY_WAIT_TIME` | `10` seconds |
 | Health check timeout | `DYN_HEALTH_CHECK_REQUEST_TIMEOUT` | `3` seconds |
-| Decode blocks threshold | `--active-decode-blocks-threshold` | None (disabled) |
-| Prefill tokens threshold | `--active-prefill-tokens-threshold` | None (disabled) |
+| Decode blocks threshold | `DYN_ACTIVE_DECODE_BLOCKS_THRESHOLD` | `1.0` |
+| Prefill tokens threshold | `DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD` | `10000000` |
+| Prefill tokens threshold (fraction) | `DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD_FRAC` | `10.0` |

 ## Failure Scenarios and Recovery


--- a/lib/llm/src/discovery/watcher.rs
+++ b/lib/llm/src/discovery/watcher.rs
@@ -876,8 +876,8 @@ impl ModelWatcher {
            let push_router = PushRouter::<
                NvCreateEmbeddingRequest,
                Annotated<NvCreateEmbeddingResponse>,
-            >::from_client_with_threshold(
-                client, self.router_config.router_mode, None, None
+            >::from_client_with_monitor(
+                client, self.router_config.router_mode, None
            )
            .await?;
            worker_set.embeddings_engine = Some(Arc::new(push_router));
@@ -896,11 +896,8 @@ impl ModelWatcher {
                let chat_router = PushRouter::<
                    NvCreateChatCompletionRequest,
                    Annotated<NvCreateChatCompletionStreamResponse>,
-                >::from_client_with_threshold(
-                    client.clone(),
-                    self.router_config.router_mode,
-                    None,
-                    None,
+                >::from_client_with_monitor(
+                    client.clone(), self.router_config.router_mode, None
                )
                .await?;
                worker_set.chat_engine = Some(Arc::new(chat_router));
@@ -910,8 +907,8 @@ impl ModelWatcher {
                let images_router = PushRouter::<
                    NvCreateImageRequest,
                    Annotated<NvImagesResponse>,
-                >::from_client_with_threshold(
-                    client.clone(), self.router_config.router_mode, None, None
+                >::from_client_with_monitor(
+                    client.clone(), self.router_config.router_mode, None
                )
                .await?;
                worker_set.images_engine = Some(Arc::new(images_router));
@@ -921,8 +918,8 @@ impl ModelWatcher {
                let videos_router = PushRouter::<
                    NvCreateVideoRequest,
                    Annotated<NvVideosResponse>,
-                >::from_client_with_threshold(
-                    client.clone(), self.router_config.router_mode, None, None
+                >::from_client_with_monitor(
+                    client.clone(), self.router_config.router_mode, None
                )
                .await?;
                worker_set.videos_engine = Some(Arc::new(videos_router));
@@ -932,11 +929,8 @@ impl ModelWatcher {
                let audios_router = PushRouter::<
                    NvCreateAudioSpeechRequest,
                    Annotated<NvAudioSpeechResponse>,
-                >::from_client_with_threshold(
-                    client.clone(),
-                    self.router_config.router_mode,
-                    None,
-                    None,
+                >::from_client_with_monitor(
+                    client.clone(), self.router_config.router_mode, None
                )
                .await?;
                worker_set.audios_engine = Some(Arc::new(audios_router));
@@ -946,8 +940,8 @@ impl ModelWatcher {
            let push_router = PushRouter::<
                NvCreateChatCompletionRequest,
                Annotated<NvCreateChatCompletionStreamResponse>,
-            >::from_client_with_threshold(
-                client, self.router_config.router_mode, None, None
+            >::from_client_with_monitor(
+                client, self.router_config.router_mode, None
            )
            .await?;
            worker_set.chat_engine = Some(Arc::new(push_router));
@@ -956,8 +950,8 @@ impl ModelWatcher {
            let push_router = PushRouter::<
                NvCreateCompletionRequest,
                Annotated<NvCreateCompletionResponse>,
-            >::from_client_with_threshold(
-                client, self.router_config.router_mode, None, None
+            >::from_client_with_monitor(
+                client, self.router_config.router_mode, None
            )
            .await?;
            worker_set.completions_engine = Some(Arc::new(push_router));
@@ -975,8 +969,8 @@ impl ModelWatcher {
            let router = PushRouter::<
                PreprocessedEmbeddingRequest,
                Annotated<EmbeddingsEngineOutput>,
-            >::from_client_with_threshold(
-                client, self.router_config.router_mode, None, None
+            >::from_client_with_monitor(
+                client, self.router_config.router_mode, None
            )
            .await?;

@@ -999,8 +993,8 @@ impl ModelWatcher {
            let push_router = PushRouter::<
                NvCreateTensorRequest,
                Annotated<NvCreateTensorResponse>,
-            >::from_client_with_threshold(
-                client, self.router_config.router_mode, None, None
+            >::from_client_with_monitor(
+                client, self.router_config.router_mode, None
            )
            .await?;
            worker_set.tensor_engine = Some(Arc::new(push_router));

--- a/lib/llm/src/discovery/worker_monitor.rs
+++ b/lib/llm/src/discovery/worker_monitor.rs
--- a/lib/llm/src/entrypoint/input/common.rs
+++ b/lib/llm/src/entrypoint/input/common.rs
@@ -322,19 +322,13 @@ where

    wait_for_min_initial_workers(&router_client, min_initial_workers).await?;

-    // Get threshold value and wrap monitor for PushRouter
-    // Note: PushRouter uses active_decode_blocks_threshold for its internal logic
-    let threshold_value = worker_monitor
-        .as_ref()
-        .map(|m| m.active_decode_blocks_threshold());
    let monitor_arc =
        worker_monitor.map(|m| Arc::new(m) as Arc<dyn dynamo_runtime::pipeline::WorkerLoadMonitor>);

    let router =
-        PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold(
+        PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_monitor(
            router_client,
            router_mode,
-            threshold_value,
            monitor_arc,
        )
        .await?;

--- a/lib/llm/src/kv_router/prefill_router/activation.rs
+++ b/lib/llm/src/kv_router/prefill_router/activation.rs
@@ -149,10 +149,9 @@ impl PrefillRouter {
            self.register_prefill_client(model_manager.as_ref(), &client);

            // Build the PushRouter for prefill with KV mode using the shared client
-            let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold(
+            let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_monitor(
                client,
                RouterMode::KV,
-                None, // busy_threshold
                None, // worker_monitor
            )
            .await?;
@@ -167,10 +166,9 @@ impl PrefillRouter {
            // Create simple push router with the frontend's router mode
            // Note: Per-worker metrics (active_prefill_tokens, active_decode_blocks) are only
            // available in KV routing mode where the router has actual bookkeeping.
-            let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold(
+            let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_monitor(
                client,
                self.router_mode,
-                None, // busy_threshold
                None, // worker_monitor
            )
            .await?;

--- a/lib/runtime/src/pipeline/network/egress/push_router.rs
+++ b/lib/runtime/src/pipeline/network/egress/push_router.rs
@@ -134,10 +134,6 @@ where
    /// addresses it, then passes it to AddressedPushRouter which does the network traffic.
    addressed: Arc<AddressedPushRouter>,

-    /// Threshold for determining when a worker is busy (0.0 to 1.0)
-    /// If None, busy detection is disabled
-    busy_threshold: Option<f64>,
-
    /// When false, `generate_with_fault_detection` skips fault detection logic:
    /// it won't call `report_instance_down` on errors, and it uses the raw discovery
    /// instance list instead of the filtered avail list. Use for recovery/query paths
@@ -275,9 +271,9 @@ where
    T: Data + Serialize,
    U: Data + for<'de> Deserialize<'de> + MaybeError,
 {
-    /// Create a new PushRouter without busy threshold (no busy detection)
+    /// Create a new PushRouter without a worker load monitor (no busy detection)
    pub async fn from_client(client: Client, router_mode: RouterMode) -> anyhow::Result<Self> {
-        Self::from_client_with_threshold(client, router_mode, None, None).await
+        Self::from_client_with_monitor(client, router_mode, None).await
    }

    /// Create a new PushRouter with fault detection disabled.
@@ -307,7 +303,6 @@ where
            addressed,
            router_mode,
            round_robin_counter: Arc::new(AtomicU64::new(0)),
-            busy_threshold: None,
            fault_detection_enabled: false,
            response_timeout: response_inactivity_timeout(),
            occupancy_state,
@@ -315,11 +310,15 @@ where
        })
    }

-    /// Create a new PushRouter with optional busy threshold and worker load monitor
-    pub async fn from_client_with_threshold(
+    /// Create a new PushRouter with an optional worker load monitor.
+    ///
+    /// The rejection path is gated by `fault_detection_enabled` (true here);
+    /// busy detection itself is driven by the monitor via `client.update_free_instances(...)`.
+    /// If no thresholds are configured on the monitor (or no monitor is provided),
+    /// `client.instance_ids_free()` returns all instances and the gate never rejects.
+    pub async fn from_client_with_monitor(
        client: Client,
        router_mode: RouterMode,
-        busy_threshold: Option<f64>,
        worker_monitor: Option<Arc<dyn WorkerLoadMonitor>>,
    ) -> anyhow::Result<Self> {
        let addressed = addressed_router(&client.endpoint).await?;
@@ -345,7 +344,6 @@ where
            addressed,
            router_mode,
            round_robin_counter: Arc::new(AtomicU64::new(0)),
-            busy_threshold,
            fault_detection_enabled: true,
            response_timeout: response_inactivity_timeout(),
            occupancy_state,
@@ -668,8 +666,8 @@ where
            )
        };

-        // Check if all workers are busy (only if busy threshold is set and fault detection enabled)
-        if self.fault_detection_enabled && self.busy_threshold.is_some() {
+        // Check if all workers are busy (when fault detection is enabled).
+        if self.fault_detection_enabled {
            let free_instances = self.client.instance_ids_free();
            if free_instances.is_empty() {
                // Check if we actually have any instances at all