Unverified Commit adfc02d5 authored by Karen Chung's avatar Karen Chung Committed by GitHub
Browse files

feat: (Router) Guard threshold-based rejection logic when CLI thresholds are unset (#8333)

parent 8fd7de9a
...@@ -45,6 +45,20 @@ def validate_model_path(value: str) -> str: ...@@ -45,6 +45,20 @@ def validate_model_path(value: str) -> str:
return value return value
def _nullable_float(value: str) -> Optional[float]:
"""Parse a float, or return None for the literal 'None'."""
if value is None or value == "None":
return None
return float(value)
def _nullable_int(value: str) -> Optional[int]:
"""Parse an int, or return None for the literal 'None'."""
if value is None or value == "None":
return None
return int(value)
class FrontendConfig(KvRouterConfigBase, AicPerfConfigBase): class FrontendConfig(KvRouterConfigBase, AicPerfConfigBase):
"""Configuration for the Dynamo frontend.""" """Configuration for the Dynamo frontend."""
...@@ -331,36 +345,37 @@ class FrontendArgGroup(ArgGroup): ...@@ -331,36 +345,37 @@ class FrontendArgGroup(ArgGroup):
g, g,
flag_name="--active-decode-blocks-threshold", flag_name="--active-decode-blocks-threshold",
env_var="DYN_ACTIVE_DECODE_BLOCKS_THRESHOLD", env_var="DYN_ACTIVE_DECODE_BLOCKS_THRESHOLD",
default=None, default=1.0,
help=( help=(
"Threshold percentage (0.0-1.0) for determining when a worker is considered busy " "Threshold fraction (0.0-1.0) of KV cache block utilization above which a worker "
"based on KV cache block utilization. If not set, blocks-based busy detection is disabled." "is considered busy. Pass 'None' on the CLI to disable this check. Default: 1.0."
), ),
arg_type=float, arg_type=_nullable_float,
) )
add_argument( add_argument(
g, g,
flag_name="--active-prefill-tokens-threshold", flag_name="--active-prefill-tokens-threshold",
env_var="DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD", env_var="DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD",
default=None, default=10_000_000,
help=( help=(
"Literal token count threshold for determining when a worker is considered busy " "Literal token count threshold for determining when a worker is considered busy "
"based on prefill token utilization. When active prefill tokens exceed this " "based on prefill token utilization. When active prefill tokens exceed this "
"threshold, the worker is marked as busy. If not set, tokens-based busy detection is disabled." "threshold, the worker is marked as busy. Pass 'None' on the CLI to disable this "
"check. Uses OR logic with --active-prefill-tokens-threshold-frac. Default: 10000000."
), ),
arg_type=int, arg_type=_nullable_int,
) )
add_argument( add_argument(
g, g,
flag_name="--active-prefill-tokens-threshold-frac", flag_name="--active-prefill-tokens-threshold-frac",
env_var="DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD_FRAC", env_var="DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD_FRAC",
default=None, default=10.0,
help=( help=(
"Fraction of max_num_batched_tokens for busy detection. Worker is busy when " "Fraction of max_num_batched_tokens for busy detection. Worker is busy when "
"active_prefill_tokens > frac * max_num_batched_tokens. Default 1.5 (disabled). " "active_prefill_tokens > frac * max_num_batched_tokens. Pass 'None' on the CLI to "
"Uses OR logic with --active-prefill-tokens-threshold." "disable this check. Uses OR logic with --active-prefill-tokens-threshold. Default: 10.0."
), ),
arg_type=float, arg_type=_nullable_float,
) )
add_argument( add_argument(
g, g,
......
...@@ -68,9 +68,9 @@ When enabled, the frontend's embedded KV router predicts one expected prefill du ...@@ -68,9 +68,9 @@ When enabled, the frontend's embedded KV router predicts one expected prefill du
| CLI Argument | Env Var | Default | Description | | CLI Argument | Env Var | Default | Description |
|-------------|---------|---------|-------------| |-------------|---------|---------|-------------|
| `--migration-limit` | `DYN_MIGRATION_LIMIT` | `0` | Max request migrations per worker disconnect. 0 = disabled | | `--migration-limit` | `DYN_MIGRATION_LIMIT` | `0` | Max request migrations per worker disconnect. 0 = disabled |
| `--active-decode-blocks-threshold` | `DYN_ACTIVE_DECODE_BLOCKS_THRESHOLD` | | KV cache utilization fraction (0.0–1.0) for busy detection | | `--active-decode-blocks-threshold` | `DYN_ACTIVE_DECODE_BLOCKS_THRESHOLD` | `1.0` | KV cache utilization fraction (0.0–1.0) for busy detection. Pass `None` to disable |
| `--active-prefill-tokens-threshold` | `DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD` | | Absolute token count for prefill busy detection | | `--active-prefill-tokens-threshold` | `DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD` | `10000000` | Absolute token count for prefill busy detection. Pass `None` to disable |
| `--active-prefill-tokens-threshold-frac` | `DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD_FRAC` | | Fraction of `max_num_batched_tokens` for prefill busy detection. OR logic with absolute threshold | | `--active-prefill-tokens-threshold-frac` | `DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD_FRAC` | `10.0` | Fraction of `max_num_batched_tokens` for prefill busy detection. OR logic with absolute threshold. Pass `None` to disable |
## Model Discovery ## Model Discovery
......
...@@ -78,8 +78,9 @@ See [Health Checks](../observability/health-checks.md) for details. ...@@ -78,8 +78,9 @@ See [Health Checks](../observability/health-checks.md) for details.
| Canary health checks | `DYN_HEALTH_CHECK_ENABLED` | `false` | | Canary health checks | `DYN_HEALTH_CHECK_ENABLED` | `false` |
| Canary wait time | `DYN_CANARY_WAIT_TIME` | `10` seconds | | Canary wait time | `DYN_CANARY_WAIT_TIME` | `10` seconds |
| Health check timeout | `DYN_HEALTH_CHECK_REQUEST_TIMEOUT` | `3` seconds | | Health check timeout | `DYN_HEALTH_CHECK_REQUEST_TIMEOUT` | `3` seconds |
| Decode blocks threshold | `--active-decode-blocks-threshold` | None (disabled) | | Decode blocks threshold | `DYN_ACTIVE_DECODE_BLOCKS_THRESHOLD` | `1.0` |
| Prefill tokens threshold | `--active-prefill-tokens-threshold` | None (disabled) | | Prefill tokens threshold | `DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD` | `10000000` |
## Failure Scenarios and Recovery ## Failure Scenarios and Recovery
......
...@@ -876,8 +876,8 @@ impl ModelWatcher { ...@@ -876,8 +876,8 @@ impl ModelWatcher {
let push_router = PushRouter::< let push_router = PushRouter::<
NvCreateEmbeddingRequest, NvCreateEmbeddingRequest,
Annotated<NvCreateEmbeddingResponse>, Annotated<NvCreateEmbeddingResponse>,
>::from_client_with_threshold( >::from_client_with_monitor(
client, self.router_config.router_mode, None, None client, self.router_config.router_mode, None
) )
.await?; .await?;
worker_set.embeddings_engine = Some(Arc::new(push_router)); worker_set.embeddings_engine = Some(Arc::new(push_router));
...@@ -896,11 +896,8 @@ impl ModelWatcher { ...@@ -896,11 +896,8 @@ impl ModelWatcher {
let chat_router = PushRouter::< let chat_router = PushRouter::<
NvCreateChatCompletionRequest, NvCreateChatCompletionRequest,
Annotated<NvCreateChatCompletionStreamResponse>, Annotated<NvCreateChatCompletionStreamResponse>,
>::from_client_with_threshold( >::from_client_with_monitor(
client.clone(), client.clone(), self.router_config.router_mode, None
self.router_config.router_mode,
None,
None,
) )
.await?; .await?;
worker_set.chat_engine = Some(Arc::new(chat_router)); worker_set.chat_engine = Some(Arc::new(chat_router));
...@@ -910,8 +907,8 @@ impl ModelWatcher { ...@@ -910,8 +907,8 @@ impl ModelWatcher {
let images_router = PushRouter::< let images_router = PushRouter::<
NvCreateImageRequest, NvCreateImageRequest,
Annotated<NvImagesResponse>, Annotated<NvImagesResponse>,
>::from_client_with_threshold( >::from_client_with_monitor(
client.clone(), self.router_config.router_mode, None, None client.clone(), self.router_config.router_mode, None
) )
.await?; .await?;
worker_set.images_engine = Some(Arc::new(images_router)); worker_set.images_engine = Some(Arc::new(images_router));
...@@ -921,8 +918,8 @@ impl ModelWatcher { ...@@ -921,8 +918,8 @@ impl ModelWatcher {
let videos_router = PushRouter::< let videos_router = PushRouter::<
NvCreateVideoRequest, NvCreateVideoRequest,
Annotated<NvVideosResponse>, Annotated<NvVideosResponse>,
>::from_client_with_threshold( >::from_client_with_monitor(
client.clone(), self.router_config.router_mode, None, None client.clone(), self.router_config.router_mode, None
) )
.await?; .await?;
worker_set.videos_engine = Some(Arc::new(videos_router)); worker_set.videos_engine = Some(Arc::new(videos_router));
...@@ -932,11 +929,8 @@ impl ModelWatcher { ...@@ -932,11 +929,8 @@ impl ModelWatcher {
let audios_router = PushRouter::< let audios_router = PushRouter::<
NvCreateAudioSpeechRequest, NvCreateAudioSpeechRequest,
Annotated<NvAudioSpeechResponse>, Annotated<NvAudioSpeechResponse>,
>::from_client_with_threshold( >::from_client_with_monitor(
client.clone(), client.clone(), self.router_config.router_mode, None
self.router_config.router_mode,
None,
None,
) )
.await?; .await?;
worker_set.audios_engine = Some(Arc::new(audios_router)); worker_set.audios_engine = Some(Arc::new(audios_router));
...@@ -946,8 +940,8 @@ impl ModelWatcher { ...@@ -946,8 +940,8 @@ impl ModelWatcher {
let push_router = PushRouter::< let push_router = PushRouter::<
NvCreateChatCompletionRequest, NvCreateChatCompletionRequest,
Annotated<NvCreateChatCompletionStreamResponse>, Annotated<NvCreateChatCompletionStreamResponse>,
>::from_client_with_threshold( >::from_client_with_monitor(
client, self.router_config.router_mode, None, None client, self.router_config.router_mode, None
) )
.await?; .await?;
worker_set.chat_engine = Some(Arc::new(push_router)); worker_set.chat_engine = Some(Arc::new(push_router));
...@@ -956,8 +950,8 @@ impl ModelWatcher { ...@@ -956,8 +950,8 @@ impl ModelWatcher {
let push_router = PushRouter::< let push_router = PushRouter::<
NvCreateCompletionRequest, NvCreateCompletionRequest,
Annotated<NvCreateCompletionResponse>, Annotated<NvCreateCompletionResponse>,
>::from_client_with_threshold( >::from_client_with_monitor(
client, self.router_config.router_mode, None, None client, self.router_config.router_mode, None
) )
.await?; .await?;
worker_set.completions_engine = Some(Arc::new(push_router)); worker_set.completions_engine = Some(Arc::new(push_router));
...@@ -975,8 +969,8 @@ impl ModelWatcher { ...@@ -975,8 +969,8 @@ impl ModelWatcher {
let router = PushRouter::< let router = PushRouter::<
PreprocessedEmbeddingRequest, PreprocessedEmbeddingRequest,
Annotated<EmbeddingsEngineOutput>, Annotated<EmbeddingsEngineOutput>,
>::from_client_with_threshold( >::from_client_with_monitor(
client, self.router_config.router_mode, None, None client, self.router_config.router_mode, None
) )
.await?; .await?;
...@@ -999,8 +993,8 @@ impl ModelWatcher { ...@@ -999,8 +993,8 @@ impl ModelWatcher {
let push_router = PushRouter::< let push_router = PushRouter::<
NvCreateTensorRequest, NvCreateTensorRequest,
Annotated<NvCreateTensorResponse>, Annotated<NvCreateTensorResponse>,
>::from_client_with_threshold( >::from_client_with_monitor(
client, self.router_config.router_mode, None, None client, self.router_config.router_mode, None
) )
.await?; .await?;
worker_set.tensor_engine = Some(Arc::new(push_router)); worker_set.tensor_engine = Some(Arc::new(push_router));
......
This diff is collapsed.
...@@ -322,19 +322,13 @@ where ...@@ -322,19 +322,13 @@ where
wait_for_min_initial_workers(&router_client, min_initial_workers).await?; wait_for_min_initial_workers(&router_client, min_initial_workers).await?;
// Get threshold value and wrap monitor for PushRouter
// Note: PushRouter uses active_decode_blocks_threshold for its internal logic
let threshold_value = worker_monitor
.as_ref()
.map(|m| m.active_decode_blocks_threshold());
let monitor_arc = let monitor_arc =
worker_monitor.map(|m| Arc::new(m) as Arc<dyn dynamo_runtime::pipeline::WorkerLoadMonitor>); worker_monitor.map(|m| Arc::new(m) as Arc<dyn dynamo_runtime::pipeline::WorkerLoadMonitor>);
let router = let router =
PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold( PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_monitor(
router_client, router_client,
router_mode, router_mode,
threshold_value,
monitor_arc, monitor_arc,
) )
.await?; .await?;
......
...@@ -149,10 +149,9 @@ impl PrefillRouter { ...@@ -149,10 +149,9 @@ impl PrefillRouter {
self.register_prefill_client(model_manager.as_ref(), &client); self.register_prefill_client(model_manager.as_ref(), &client);
// Build the PushRouter for prefill with KV mode using the shared client // Build the PushRouter for prefill with KV mode using the shared client
let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold( let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_monitor(
client, client,
RouterMode::KV, RouterMode::KV,
None, // busy_threshold
None, // worker_monitor None, // worker_monitor
) )
.await?; .await?;
...@@ -167,10 +166,9 @@ impl PrefillRouter { ...@@ -167,10 +166,9 @@ impl PrefillRouter {
// Create simple push router with the frontend's router mode // Create simple push router with the frontend's router mode
// Note: Per-worker metrics (active_prefill_tokens, active_decode_blocks) are only // Note: Per-worker metrics (active_prefill_tokens, active_decode_blocks) are only
// available in KV routing mode where the router has actual bookkeeping. // available in KV routing mode where the router has actual bookkeeping.
let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold( let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_monitor(
client, client,
self.router_mode, self.router_mode,
None, // busy_threshold
None, // worker_monitor None, // worker_monitor
) )
.await?; .await?;
......
...@@ -134,10 +134,6 @@ where ...@@ -134,10 +134,6 @@ where
/// addresses it, then passes it to AddressedPushRouter which does the network traffic. /// addresses it, then passes it to AddressedPushRouter which does the network traffic.
addressed: Arc<AddressedPushRouter>, addressed: Arc<AddressedPushRouter>,
/// Threshold for determining when a worker is busy (0.0 to 1.0)
/// If None, busy detection is disabled
busy_threshold: Option<f64>,
/// When false, `generate_with_fault_detection` skips fault detection logic: /// When false, `generate_with_fault_detection` skips fault detection logic:
/// it won't call `report_instance_down` on errors, and it uses the raw discovery /// it won't call `report_instance_down` on errors, and it uses the raw discovery
/// instance list instead of the filtered avail list. Use for recovery/query paths /// instance list instead of the filtered avail list. Use for recovery/query paths
...@@ -275,9 +271,9 @@ where ...@@ -275,9 +271,9 @@ where
T: Data + Serialize, T: Data + Serialize,
U: Data + for<'de> Deserialize<'de> + MaybeError, U: Data + for<'de> Deserialize<'de> + MaybeError,
{ {
/// Create a new PushRouter without busy threshold (no busy detection) /// Create a new PushRouter without a worker load monitor (no busy detection)
pub async fn from_client(client: Client, router_mode: RouterMode) -> anyhow::Result<Self> { pub async fn from_client(client: Client, router_mode: RouterMode) -> anyhow::Result<Self> {
Self::from_client_with_threshold(client, router_mode, None, None).await Self::from_client_with_monitor(client, router_mode, None).await
} }
/// Create a new PushRouter with fault detection disabled. /// Create a new PushRouter with fault detection disabled.
...@@ -307,7 +303,6 @@ where ...@@ -307,7 +303,6 @@ where
addressed, addressed,
router_mode, router_mode,
round_robin_counter: Arc::new(AtomicU64::new(0)), round_robin_counter: Arc::new(AtomicU64::new(0)),
busy_threshold: None,
fault_detection_enabled: false, fault_detection_enabled: false,
response_timeout: response_inactivity_timeout(), response_timeout: response_inactivity_timeout(),
occupancy_state, occupancy_state,
...@@ -315,11 +310,15 @@ where ...@@ -315,11 +310,15 @@ where
}) })
} }
/// Create a new PushRouter with optional busy threshold and worker load monitor /// Create a new PushRouter with an optional worker load monitor.
pub async fn from_client_with_threshold( ///
/// The rejection path is gated by `fault_detection_enabled` (true here);
/// busy detection itself is driven by the monitor via `client.update_free_instances(...)`.
/// If no thresholds are configured on the monitor (or no monitor is provided),
/// `client.instance_ids_free()` returns all instances and the gate never rejects.
pub async fn from_client_with_monitor(
client: Client, client: Client,
router_mode: RouterMode, router_mode: RouterMode,
busy_threshold: Option<f64>,
worker_monitor: Option<Arc<dyn WorkerLoadMonitor>>, worker_monitor: Option<Arc<dyn WorkerLoadMonitor>>,
) -> anyhow::Result<Self> { ) -> anyhow::Result<Self> {
let addressed = addressed_router(&client.endpoint).await?; let addressed = addressed_router(&client.endpoint).await?;
...@@ -345,7 +344,6 @@ where ...@@ -345,7 +344,6 @@ where
addressed, addressed,
router_mode, router_mode,
round_robin_counter: Arc::new(AtomicU64::new(0)), round_robin_counter: Arc::new(AtomicU64::new(0)),
busy_threshold,
fault_detection_enabled: true, fault_detection_enabled: true,
response_timeout: response_inactivity_timeout(), response_timeout: response_inactivity_timeout(),
occupancy_state, occupancy_state,
...@@ -668,8 +666,8 @@ where ...@@ -668,8 +666,8 @@ where
) )
}; };
// Check if all workers are busy (only if busy threshold is set and fault detection enabled) // Check if all workers are busy (when fault detection is enabled).
if self.fault_detection_enabled && self.busy_threshold.is_some() { if self.fault_detection_enabled {
let free_instances = self.client.instance_ids_free(); let free_instances = self.client.instance_ids_free();
if free_instances.is_empty() { if free_instances.is_empty() {
// Check if we actually have any instances at all // Check if we actually have any instances at all
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment