Unverified Commit adfc02d5 authored by Karen Chung's avatar Karen Chung Committed by GitHub
Browse files

feat: (Router) Guard threshold-based rejection logic when CLI thresholds are unset (#8333)

parent 8fd7de9a
......@@ -45,6 +45,20 @@ def validate_model_path(value: str) -> str:
return value
def _nullable_float(value: str) -> Optional[float]:
"""Parse a float, or return None for the literal 'None'."""
if value is None or value == "None":
return None
return float(value)
def _nullable_int(value: str) -> Optional[int]:
"""Parse an int, or return None for the literal 'None'."""
if value is None or value == "None":
return None
return int(value)
class FrontendConfig(KvRouterConfigBase, AicPerfConfigBase):
"""Configuration for the Dynamo frontend."""
......@@ -331,36 +345,37 @@ class FrontendArgGroup(ArgGroup):
g,
flag_name="--active-decode-blocks-threshold",
env_var="DYN_ACTIVE_DECODE_BLOCKS_THRESHOLD",
default=None,
default=1.0,
help=(
"Threshold percentage (0.0-1.0) for determining when a worker is considered busy "
"based on KV cache block utilization. If not set, blocks-based busy detection is disabled."
"Threshold fraction (0.0-1.0) of KV cache block utilization above which a worker "
"is considered busy. Pass 'None' on the CLI to disable this check. Default: 1.0."
),
arg_type=float,
arg_type=_nullable_float,
)
add_argument(
g,
flag_name="--active-prefill-tokens-threshold",
env_var="DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD",
default=None,
default=10_000_000,
help=(
"Literal token count threshold for determining when a worker is considered busy "
"based on prefill token utilization. When active prefill tokens exceed this "
"threshold, the worker is marked as busy. If not set, tokens-based busy detection is disabled."
"threshold, the worker is marked as busy. Pass 'None' on the CLI to disable this "
"check. Uses OR logic with --active-prefill-tokens-threshold-frac. Default: 10000000."
),
arg_type=int,
arg_type=_nullable_int,
)
add_argument(
g,
flag_name="--active-prefill-tokens-threshold-frac",
env_var="DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD_FRAC",
default=None,
default=10.0,
help=(
"Fraction of max_num_batched_tokens for busy detection. Worker is busy when "
"active_prefill_tokens > frac * max_num_batched_tokens. Default 1.5 (disabled). "
"Uses OR logic with --active-prefill-tokens-threshold."
"active_prefill_tokens > frac * max_num_batched_tokens. Pass 'None' on the CLI to "
"disable this check. Uses OR logic with --active-prefill-tokens-threshold. Default: 10.0."
),
arg_type=float,
arg_type=_nullable_float,
)
add_argument(
g,
......
......@@ -68,9 +68,9 @@ When enabled, the frontend's embedded KV router predicts one expected prefill du
| CLI Argument | Env Var | Default | Description |
|-------------|---------|---------|-------------|
| `--migration-limit` | `DYN_MIGRATION_LIMIT` | `0` | Max request migrations per worker disconnect. 0 = disabled |
| `--active-decode-blocks-threshold` | `DYN_ACTIVE_DECODE_BLOCKS_THRESHOLD` | | KV cache utilization fraction (0.0–1.0) for busy detection |
| `--active-prefill-tokens-threshold` | `DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD` | | Absolute token count for prefill busy detection |
| `--active-prefill-tokens-threshold-frac` | `DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD_FRAC` | | Fraction of `max_num_batched_tokens` for prefill busy detection. OR logic with absolute threshold |
| `--active-decode-blocks-threshold` | `DYN_ACTIVE_DECODE_BLOCKS_THRESHOLD` | `1.0` | KV cache utilization fraction (0.0–1.0) for busy detection. Pass `None` to disable |
| `--active-prefill-tokens-threshold` | `DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD` | `10000000` | Absolute token count for prefill busy detection. Pass `None` to disable |
| `--active-prefill-tokens-threshold-frac` | `DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD_FRAC` | `10.0` | Fraction of `max_num_batched_tokens` for prefill busy detection. Combined with the absolute threshold using OR logic. Pass `None` to disable |
## Model Discovery
......
......@@ -78,8 +78,9 @@ See [Health Checks](../observability/health-checks.md) for details.
| Canary health checks | `DYN_HEALTH_CHECK_ENABLED` | `false` |
| Canary wait time | `DYN_CANARY_WAIT_TIME` | `10` seconds |
| Health check timeout | `DYN_HEALTH_CHECK_REQUEST_TIMEOUT` | `3` seconds |
| Decode blocks threshold | `--active-decode-blocks-threshold` | None (disabled) |
| Prefill tokens threshold | `--active-prefill-tokens-threshold` | None (disabled) |
| Decode blocks threshold | `DYN_ACTIVE_DECODE_BLOCKS_THRESHOLD` | `1.0` |
| Prefill tokens threshold | `DYN_ACTIVE_PREFILL_TOKENS_THRESHOLD` | `10000000` |
## Failure Scenarios and Recovery
......
......@@ -876,8 +876,8 @@ impl ModelWatcher {
let push_router = PushRouter::<
NvCreateEmbeddingRequest,
Annotated<NvCreateEmbeddingResponse>,
>::from_client_with_threshold(
client, self.router_config.router_mode, None, None
>::from_client_with_monitor(
client, self.router_config.router_mode, None
)
.await?;
worker_set.embeddings_engine = Some(Arc::new(push_router));
......@@ -896,11 +896,8 @@ impl ModelWatcher {
let chat_router = PushRouter::<
NvCreateChatCompletionRequest,
Annotated<NvCreateChatCompletionStreamResponse>,
>::from_client_with_threshold(
client.clone(),
self.router_config.router_mode,
None,
None,
>::from_client_with_monitor(
client.clone(), self.router_config.router_mode, None
)
.await?;
worker_set.chat_engine = Some(Arc::new(chat_router));
......@@ -910,8 +907,8 @@ impl ModelWatcher {
let images_router = PushRouter::<
NvCreateImageRequest,
Annotated<NvImagesResponse>,
>::from_client_with_threshold(
client.clone(), self.router_config.router_mode, None, None
>::from_client_with_monitor(
client.clone(), self.router_config.router_mode, None
)
.await?;
worker_set.images_engine = Some(Arc::new(images_router));
......@@ -921,8 +918,8 @@ impl ModelWatcher {
let videos_router = PushRouter::<
NvCreateVideoRequest,
Annotated<NvVideosResponse>,
>::from_client_with_threshold(
client.clone(), self.router_config.router_mode, None, None
>::from_client_with_monitor(
client.clone(), self.router_config.router_mode, None
)
.await?;
worker_set.videos_engine = Some(Arc::new(videos_router));
......@@ -932,11 +929,8 @@ impl ModelWatcher {
let audios_router = PushRouter::<
NvCreateAudioSpeechRequest,
Annotated<NvAudioSpeechResponse>,
>::from_client_with_threshold(
client.clone(),
self.router_config.router_mode,
None,
None,
>::from_client_with_monitor(
client.clone(), self.router_config.router_mode, None
)
.await?;
worker_set.audios_engine = Some(Arc::new(audios_router));
......@@ -946,8 +940,8 @@ impl ModelWatcher {
let push_router = PushRouter::<
NvCreateChatCompletionRequest,
Annotated<NvCreateChatCompletionStreamResponse>,
>::from_client_with_threshold(
client, self.router_config.router_mode, None, None
>::from_client_with_monitor(
client, self.router_config.router_mode, None
)
.await?;
worker_set.chat_engine = Some(Arc::new(push_router));
......@@ -956,8 +950,8 @@ impl ModelWatcher {
let push_router = PushRouter::<
NvCreateCompletionRequest,
Annotated<NvCreateCompletionResponse>,
>::from_client_with_threshold(
client, self.router_config.router_mode, None, None
>::from_client_with_monitor(
client, self.router_config.router_mode, None
)
.await?;
worker_set.completions_engine = Some(Arc::new(push_router));
......@@ -975,8 +969,8 @@ impl ModelWatcher {
let router = PushRouter::<
PreprocessedEmbeddingRequest,
Annotated<EmbeddingsEngineOutput>,
>::from_client_with_threshold(
client, self.router_config.router_mode, None, None
>::from_client_with_monitor(
client, self.router_config.router_mode, None
)
.await?;
......@@ -999,8 +993,8 @@ impl ModelWatcher {
let push_router = PushRouter::<
NvCreateTensorRequest,
Annotated<NvCreateTensorResponse>,
>::from_client_with_threshold(
client, self.router_config.router_mode, None, None
>::from_client_with_monitor(
client, self.router_config.router_mode, None
)
.await?;
worker_set.tensor_engine = Some(Arc::new(push_router));
......
This diff is collapsed.
......@@ -322,19 +322,13 @@ where
wait_for_min_initial_workers(&router_client, min_initial_workers).await?;
// Get threshold value and wrap monitor for PushRouter
// Note: PushRouter uses active_decode_blocks_threshold for its internal logic
let threshold_value = worker_monitor
.as_ref()
.map(|m| m.active_decode_blocks_threshold());
let monitor_arc =
worker_monitor.map(|m| Arc::new(m) as Arc<dyn dynamo_runtime::pipeline::WorkerLoadMonitor>);
let router =
PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold(
PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_monitor(
router_client,
router_mode,
threshold_value,
monitor_arc,
)
.await?;
......
......@@ -149,10 +149,9 @@ impl PrefillRouter {
self.register_prefill_client(model_manager.as_ref(), &client);
// Build the PushRouter for prefill with KV mode using the shared client
let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold(
let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_monitor(
client,
RouterMode::KV,
None, // busy_threshold
None, // worker_monitor
)
.await?;
......@@ -167,10 +166,9 @@ impl PrefillRouter {
// Create simple push router with the frontend's router mode
// Note: Per-worker metrics (active_prefill_tokens, active_decode_blocks) are only
// available in KV routing mode where the router has actual bookkeeping.
let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_threshold(
let push_router = PushRouter::<PreprocessedRequest, Annotated<LLMEngineOutput>>::from_client_with_monitor(
client,
self.router_mode,
None, // busy_threshold
None, // worker_monitor
)
.await?;
......
......@@ -134,10 +134,6 @@ where
/// addresses it, then passes it to AddressedPushRouter which does the network traffic.
addressed: Arc<AddressedPushRouter>,
/// Threshold for determining when a worker is busy (0.0 to 1.0)
/// If None, busy detection is disabled
busy_threshold: Option<f64>,
/// When false, `generate_with_fault_detection` skips fault detection logic:
/// it won't call `report_instance_down` on errors, and it uses the raw discovery
/// instance list instead of the filtered avail list. Use for recovery/query paths
......@@ -275,9 +271,9 @@ where
T: Data + Serialize,
U: Data + for<'de> Deserialize<'de> + MaybeError,
{
/// Create a new PushRouter without busy threshold (no busy detection)
/// Create a new PushRouter without a worker load monitor (no busy detection)
pub async fn from_client(client: Client, router_mode: RouterMode) -> anyhow::Result<Self> {
Self::from_client_with_threshold(client, router_mode, None, None).await
Self::from_client_with_monitor(client, router_mode, None).await
}
/// Create a new PushRouter with fault detection disabled.
......@@ -307,7 +303,6 @@ where
addressed,
router_mode,
round_robin_counter: Arc::new(AtomicU64::new(0)),
busy_threshold: None,
fault_detection_enabled: false,
response_timeout: response_inactivity_timeout(),
occupancy_state,
......@@ -315,11 +310,15 @@ where
})
}
/// Create a new PushRouter with optional busy threshold and worker load monitor
pub async fn from_client_with_threshold(
/// Create a new PushRouter with an optional worker load monitor.
///
/// The rejection path is gated by `fault_detection_enabled` (true here);
/// busy detection itself is driven by the monitor via `client.update_free_instances(...)`.
/// If no thresholds are configured on the monitor (or no monitor is provided),
/// `client.instance_ids_free()` returns all instances and the gate never rejects.
pub async fn from_client_with_monitor(
client: Client,
router_mode: RouterMode,
busy_threshold: Option<f64>,
worker_monitor: Option<Arc<dyn WorkerLoadMonitor>>,
) -> anyhow::Result<Self> {
let addressed = addressed_router(&client.endpoint).await?;
......@@ -345,7 +344,6 @@ where
addressed,
router_mode,
round_robin_counter: Arc::new(AtomicU64::new(0)),
busy_threshold,
fault_detection_enabled: true,
response_timeout: response_inactivity_timeout(),
occupancy_state,
......@@ -668,8 +666,8 @@ where
)
};
// Check if all workers are busy (only if busy threshold is set and fault detection enabled)
if self.fault_detection_enabled && self.busy_threshold.is_some() {
// Check if all workers are busy (when fault detection is enabled).
if self.fault_detection_enabled {
let free_instances = self.client.instance_ids_free();
if free_instances.is_empty() {
// Check if we actually have any instances at all
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment