Unverified Commit 7c15166d authored by Tzu-Ling Kan's avatar Tzu-Ling Kan Committed by GitHub
Browse files

feat: Disable health checks by default; auto-enable in K8s via operator (#4804)


Signed-off-by: tzulingk@nvidia.com <tzulingk@nvidia.com>
parent a473402f
...@@ -827,6 +827,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. ...@@ -827,6 +827,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
Args: []string{"ray start --head --port=6379 && some dynamo command --tensor-parallel-size 4 --pipeline-parallel-size 1"}, Args: []string{"ray start --head --port=6379 && some dynamo command --tensor-parallel-size 4 --pipeline-parallel-size 1"},
Env: []corev1.EnvVar{ Env: []corev1.EnvVar{
{Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker}, {Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker},
{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "true"},
{Name: commonconsts.DynamoNamespaceEnvVar, Value: "default"}, {Name: commonconsts.DynamoNamespaceEnvVar, Value: "default"},
{Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-lws-deploy"}, {Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-lws-deploy"},
{Name: "DYN_PARENT_DGD_K8S_NAMESPACE", Value: "default"}, {Name: "DYN_PARENT_DGD_K8S_NAMESPACE", Value: "default"},
...@@ -955,6 +956,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing. ...@@ -955,6 +956,7 @@ func TestDynamoComponentDeploymentReconciler_generateLeaderWorkerSet(t *testing.
Args: []string{"ray start --address=$LWS_LEADER_ADDRESS:6379 --block"}, Args: []string{"ray start --address=$LWS_LEADER_ADDRESS:6379 --block"},
Env: []corev1.EnvVar{ Env: []corev1.EnvVar{
{Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker}, {Name: commonconsts.DynamoComponentEnvVar, Value: commonconsts.ComponentTypeWorker},
{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "true"},
{Name: commonconsts.DynamoNamespaceEnvVar, Value: "default"}, {Name: commonconsts.DynamoNamespaceEnvVar, Value: "default"},
{Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-lws-deploy"}, {Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-lws-deploy"},
{Name: "DYN_PARENT_DGD_K8S_NAMESPACE", Value: "default"}, {Name: "DYN_PARENT_DGD_K8S_NAMESPACE", Value: "default"},
......
...@@ -86,6 +86,10 @@ func (w *WorkerDefaults) GetBaseContainer(context ComponentContext) (corev1.Cont ...@@ -86,6 +86,10 @@ func (w *WorkerDefaults) GetBaseContainer(context ComponentContext) (corev1.Cont
Name: "DYN_SYSTEM_PORT", Name: "DYN_SYSTEM_PORT",
Value: fmt.Sprintf("%d", commonconsts.DynamoSystemPort), Value: fmt.Sprintf("%d", commonconsts.DynamoSystemPort),
}, },
{
Name: "DYN_HEALTH_CHECK_ENABLED",
Value: "true",
},
}...) }...)
return container, nil return container, nil
......
...@@ -1963,6 +1963,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -1963,6 +1963,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: commonconsts.DynamoComponentEnvVar, Name: commonconsts.DynamoComponentEnvVar,
Value: commonconsts.ComponentTypeWorker, Value: commonconsts.ComponentTypeWorker,
}, },
{
Name: "DYN_HEALTH_CHECK_ENABLED",
Value: "true",
},
{ {
Name: "DYN_PARENT_DGD_K8S_NAME", Name: "DYN_PARENT_DGD_K8S_NAME",
Value: "test-dynamo-graph-deployment", Value: "test-dynamo-graph-deployment",
...@@ -2140,6 +2144,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -2140,6 +2144,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: commonconsts.DynamoComponentEnvVar, Name: commonconsts.DynamoComponentEnvVar,
Value: commonconsts.ComponentTypeWorker, Value: commonconsts.ComponentTypeWorker,
}, },
{
Name: "DYN_HEALTH_CHECK_ENABLED",
Value: "true",
},
{ {
Name: "DYN_PARENT_DGD_K8S_NAME", Name: "DYN_PARENT_DGD_K8S_NAME",
Value: "test-dynamo-graph-deployment", Value: "test-dynamo-graph-deployment",
...@@ -2864,6 +2872,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -2864,6 +2872,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: commonconsts.DynamoComponentEnvVar, Name: commonconsts.DynamoComponentEnvVar,
Value: commonconsts.ComponentTypeWorker, Value: commonconsts.ComponentTypeWorker,
}, },
{
Name: "DYN_HEALTH_CHECK_ENABLED",
Value: "true",
},
{ {
Name: "DYN_PARENT_DGD_K8S_NAME", Name: "DYN_PARENT_DGD_K8S_NAME",
Value: "test-dynamo-graph-deployment", Value: "test-dynamo-graph-deployment",
...@@ -3028,6 +3040,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) { ...@@ -3028,6 +3040,10 @@ func TestGenerateGrovePodCliqueSet(t *testing.T) {
Name: commonconsts.DynamoComponentEnvVar, Name: commonconsts.DynamoComponentEnvVar,
Value: commonconsts.ComponentTypeWorker, Value: commonconsts.ComponentTypeWorker,
}, },
{
Name: "DYN_HEALTH_CHECK_ENABLED",
Value: "true",
},
{ {
Name: "DYN_PARENT_DGD_K8S_NAME", Name: "DYN_PARENT_DGD_K8S_NAME",
Value: "test-dynamo-graph-deployment", Value: "test-dynamo-graph-deployment",
...@@ -4989,6 +5005,7 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) { ...@@ -4989,6 +5005,7 @@ func TestGenerateBasePodSpec_Worker(t *testing.T) {
{Name: "ANOTHER_COMPONENTENV", Value: "true"}, {Name: "ANOTHER_COMPONENTENV", Value: "true"},
{Name: "ANOTHER_CONTAINER_ENV", Value: "true"}, {Name: "ANOTHER_CONTAINER_ENV", Value: "true"},
{Name: commonconsts.DynamoComponentEnvVar, Value: "worker"}, {Name: commonconsts.DynamoComponentEnvVar, Value: "worker"},
{Name: "DYN_HEALTH_CHECK_ENABLED", Value: "true"},
{Name: commonconsts.DynamoNamespaceEnvVar, Value: ""}, {Name: commonconsts.DynamoNamespaceEnvVar, Value: ""},
{Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-deployment"}, {Name: "DYN_PARENT_DGD_K8S_NAME", Value: "test-deployment"},
{Name: "DYN_PARENT_DGD_K8S_NAMESPACE", Value: "default"}, {Name: "DYN_PARENT_DGD_K8S_NAMESPACE", Value: "default"},
......
...@@ -20,6 +20,9 @@ orchestration frameworks such as Kubernetes. ...@@ -20,6 +20,9 @@ orchestration frameworks such as Kubernetes.
| `DYN_SYSTEM_HEALTH_PATH` | Custom health endpoint path | `/health` | `/custom/health` | | `DYN_SYSTEM_HEALTH_PATH` | Custom health endpoint path | `/health` | `/custom/health` |
| `DYN_SYSTEM_LIVE_PATH` | Custom liveness endpoint path | `/live` | `/custom/live` | | `DYN_SYSTEM_LIVE_PATH` | Custom liveness endpoint path | `/live` | `/custom/live` |
| `DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS` | Endpoints required for ready state | none | `["generate"]` | | `DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS` | Endpoints required for ready state | none | `["generate"]` |
| `DYN_HEALTH_CHECK_ENABLED` | Enable canary health checks | `false` (K8s: `true`) | `true`, `false` |
| `DYN_CANARY_WAIT_TIME` | Seconds before sending canary health check | `10` | `5`, `30` |
| `DYN_HEALTH_CHECK_REQUEST_TIMEOUT` | Health check request timeout in seconds | `3` | `5`, `10` |
## Getting Started Quickly ## Getting Started Quickly
...@@ -213,6 +216,127 @@ date: Wed, 03 Sep 2025 13:42:45 GMT ...@@ -213,6 +216,127 @@ date: Wed, 03 Sep 2025 13:42:45 GMT
} }
``` ```
## Canary Health Checks (Active Monitoring)
In addition to the HTTP endpoints described above, Dynamo includes a **canary health check** system that actively monitors worker endpoints.
### Overview
The canary health check system:
- **Monitors endpoint health** by sending periodic test requests to worker endpoints
- **Only activates during idle periods** - if there's ongoing traffic, health checks are skipped to avoid overhead
- **Automatically enabled in Kubernetes** deployments via the operator
- **Disabled by default** in local/development environments
### How It Works
1. **Idle Detection**: When an endpoint has seen no activity for a configurable wait time (`DYN_CANARY_WAIT_TIME`, default: 10 seconds), a canary health check is triggered
2. **Health Check Request**: A lightweight test request is sent to the endpoint with a minimal payload (generates 1 token)
3. **Activity Resets Timer**: If normal requests arrive, the canary timer resets and no health check is sent
4. **Timeout Handling**: If the endpoint doesn't respond to a health check within the request timeout (`DYN_HEALTH_CHECK_REQUEST_TIMEOUT`, default: 3 seconds), the endpoint is marked as unhealthy
### Configuration
#### In Kubernetes (Enabled by Default)
Health checks are automatically enabled by the Dynamo operator, which sets `DYN_HEALTH_CHECK_ENABLED=true` on worker containers. No additional configuration is required.
```yaml
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: my-deployment
spec:
services:
VllmWorker:
componentType: worker
replicas: 2
# Health checks automatically enabled by operator
```
#### In Local/Development Environments (Disabled by Default)
To enable health checks locally:
```bash
# Enable health checks
export DYN_HEALTH_CHECK_ENABLED=true
# Optional: Customize timing
export DYN_CANARY_WAIT_TIME=5 # Wait 5 seconds before sending health check
export DYN_HEALTH_CHECK_REQUEST_TIMEOUT=5 # 5 second timeout
# Start worker
python -m dynamo.vllm --model Qwen/Qwen3-0.6B
```
#### Configuration Options
| Environment Variable | Description | Default | Notes |
|---------------------|-------------|---------|-------|
| `DYN_HEALTH_CHECK_ENABLED` | Enable/disable canary health checks | `false` (K8s: `true`) | Automatically set to `true` in K8s |
| `DYN_CANARY_WAIT_TIME` | Seconds to wait (during idle) before sending health check | `10` | Lower values = more frequent checks |
| `DYN_HEALTH_CHECK_REQUEST_TIMEOUT` | Max seconds to wait for health check response | `3` | Higher values = more tolerance for slow responses |
### Health Check Payloads
Each backend defines its own minimal health check payload:
- **vLLM**: Single token generation with minimal sampling options
- **TensorRT-LLM**: Single token with BOS token ID
- **SGLang**: Single token generation request
These payloads are designed to:
- Complete quickly (< 100ms typically)
- Minimize GPU overhead
- Verify the full inference stack is working
### Observing Health Checks
When health checks are enabled, you'll see logs like:
```
INFO Health check manager started (canary_wait_time: 10s, request_timeout: 3s)
INFO Spawned health check task for endpoint: generate
INFO Canary timer expired for generate, sending health check
INFO Health check successful for generate
```
If an endpoint fails:
```
WARN Health check timeout for generate
ERROR Health check request failed for generate: connection refused
```
### When to Use Canary Health Checks
**Enable in production (Kubernetes):**
- ✅ Detect unhealthy workers before they affect user traffic
- ✅ Enable faster failure detection and recovery
- ✅ Monitor worker availability continuously
**Disable in development:**
- ✅ Reduce log noise during debugging
- ✅ Avoid overhead when not needed
- ✅ Simplify local testing
### Troubleshooting
**Health checks timing out:**
- Increase `DYN_HEALTH_CHECK_REQUEST_TIMEOUT`
- Check worker logs for errors
- Verify network connectivity
**Too many health check logs:**
- Increase `DYN_CANARY_WAIT_TIME` to reduce frequency
- Or set `DYN_HEALTH_CHECK_ENABLED=false` to disable health checks entirely in development environments
**Health checks not running:**
- Verify `DYN_HEALTH_CHECK_ENABLED=true` is set
- Check that `DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS` includes the endpoint
- Ensure the worker is serving the endpoint
## Related Documentation ## Related Documentation
- [Distributed Runtime Architecture](../design_docs/distributed_runtime.md) - [Distributed Runtime Architecture](../design_docs/distributed_runtime.md)
......
...@@ -162,7 +162,7 @@ pub struct RuntimeConfig { ...@@ -162,7 +162,7 @@ pub struct RuntimeConfig {
/// Enable active health checking with payloads /// Enable active health checking with payloads
/// Set this at runtime with environment variable DYN_HEALTH_CHECK_ENABLED /// Set this at runtime with environment variable DYN_HEALTH_CHECK_ENABLED
#[builder(default = "true")] #[builder(default = "false")]
#[builder_field_attr(serde(skip_serializing_if = "Option::is_none"))] #[builder_field_attr(serde(skip_serializing_if = "Option::is_none"))]
pub health_check_enabled: bool, pub health_check_enabled: bool,
...@@ -358,7 +358,7 @@ impl RuntimeConfig { ...@@ -358,7 +358,7 @@ impl RuntimeConfig {
compute_threads: Some(1), compute_threads: Some(1),
compute_stack_size: Some(2 * 1024 * 1024), compute_stack_size: Some(2 * 1024 * 1024),
compute_thread_prefix: "compute".to_string(), compute_thread_prefix: "compute".to_string(),
health_check_enabled: true, health_check_enabled: false,
canary_wait_time_secs: DEFAULT_CANARY_WAIT_TIME_SECS, canary_wait_time_secs: DEFAULT_CANARY_WAIT_TIME_SECS,
health_check_request_timeout_secs: DEFAULT_HEALTH_CHECK_REQUEST_TIMEOUT_SECS, health_check_request_timeout_secs: DEFAULT_HEALTH_CHECK_REQUEST_TIMEOUT_SECS,
} }
...@@ -394,7 +394,7 @@ impl Default for RuntimeConfig { ...@@ -394,7 +394,7 @@ impl Default for RuntimeConfig {
compute_threads: None, compute_threads: None,
compute_stack_size: Some(2 * 1024 * 1024), compute_stack_size: Some(2 * 1024 * 1024),
compute_thread_prefix: "compute".to_string(), compute_thread_prefix: "compute".to_string(),
health_check_enabled: true, health_check_enabled: false,
canary_wait_time_secs: DEFAULT_CANARY_WAIT_TIME_SECS, canary_wait_time_secs: DEFAULT_CANARY_WAIT_TIME_SECS,
health_check_request_timeout_secs: DEFAULT_HEALTH_CHECK_REQUEST_TIMEOUT_SECS, health_check_request_timeout_secs: DEFAULT_HEALTH_CHECK_REQUEST_TIMEOUT_SECS,
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment